In [1]:
import torch
import torch.nn as nn
from torchsummary import summary

def calculate_memory_usage(model, batch_size, input_size):
    """
    Calculate the memory usage for a given model, batch size, and input size.
    
    Parameters:
    - model: The neural network model (PyTorch model).
    - batch_size: The size of the batches for training/inference.
    - input_size: The size of the input tensor (e.g., (channels, height, width) for images).
    
    Returns:
    - ram_usage: Estimated RAM usage in bytes.
    - gpu_usage: Estimated GPU memory usage in bytes.
    """
    # Ensure the model is on the CPU for initial RAM estimation
    model = model.cpu()
    
    # Create a dummy input tensor with the specified batch size and input size
    dummy_input = torch.randn(batch_size, *input_size)
    
    # Estimate the model size (parameters)
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    
    # Estimate the forward pass activation size
    def forward_hook(module, input, output):
        activations.append(output)
    
    activations = []
    hooks = []
    for layer in model.modules():
        if isinstance(layer, nn.Module):
            hooks.append(layer.register_forward_hook(forward_hook))
    
    # Perform a forward pass to populate the activations list
    model(dummy_input)
    
    # Remove the hooks
    for hook in hooks:
        hook.remove()
    
    activation_size = sum(act.numel() * act.element_size() for act in activations)
    
    # Estimate the backward pass activation size (same as forward pass)
    backward_activation_size = activation_size
    
    # Total RAM usage (model parameters + activations for forward and backward pass)
    ram_usage = param_size + (activation_size + backward_activation_size)
    
    # GPU usage estimation (if using a GPU)
    if torch.cuda.is_available():
        model = model.cuda()
        dummy_input = dummy_input.cuda()
        
        # Clear any existing memory
        torch.cuda.empty_cache()
        
        # Perform a forward and backward pass to estimate GPU memory usage
        gpu_memory_before = torch.cuda.memory_allocated()
        output = model(dummy_input)
        loss = output.sum()
        loss.backward()
        gpu_memory_after = torch.cuda.memory_allocated()
        
        gpu_usage = gpu_memory_after - gpu_memory_before

        # #Free GPU resources
        del dummy_input
        del output
        del loss
        model = model.cpu()
        torch.cuda.empty_cache()
    else:
        gpu_usage = None
    
    return ram_usage, gpu_usage

# Example usage
if __name__ == "__main__":
    # Define a simple model for demonstration
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc1 = nn.Linear(784, 256)
            self.fc2 = nn.Linear(256, 128)
            self.fc3 = nn.Linear(128, 10)
        
        def forward(self, x):
            x = torch.flatten(x, 1)
            x = torch.relu(self.fc1(x))
            x = torch.relu(self.fc2(x))
            x = self.fc3(x)
            return x
    
    model = SimpleModel()
    batch_size = 32
    input_size = (1, 28, 28)  # Example for MNIST dataset
    
    ram_usage, gpu_usage = calculate_memory_usage(model, batch_size, input_size)
    
    print(f"Estimated RAM usage: {ram_usage / (1024 ** 2):.2f} MB")
    print(f"Estimated GPU usage: {gpu_usage / (1024 ** 2):.2f} MB" if gpu_usage else "GPU not available")



Estimated RAM usage: 1.00 MB
Estimated GPU usage: 17.15 MB


In [2]:
model.cpu()


SimpleModel(
  (fc1): Linear(in_features=784, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=10, bias=True)
)

In [3]:
torch.cuda.empty_cache()

In [1]:
import torch
import torch.nn as nn

class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = SimpleModel()

# Calculate the number of parameters
num_params = sum(p.numel() for p in model.parameters())
model_size_MB = num_params * 4 / (1024 ** 2)  # Model size in MB
print(f"Model size: {model_size_MB:.2f} MB")


Model size: 0.90 MB


In [4]:
import torch
import torch.nn as nn

def calculate_memory_usage(model, batch_size, input_size):
    """
    Calculate the memory usage for a given model, batch size, and input size.
    
    Parameters:
    - model: The neural network model (PyTorch model).
    - batch_size: The size of the batches for training/inference.
    - input_size: The size of the input tensor (e.g., (channels, height, width) for images).
    
    Returns:
    - ram_usage_training: Estimated RAM usage in bytes during training.
    - ram_usage_inference: Estimated RAM usage in bytes during inference.
    - gpu_usage_training: Estimated GPU memory usage in bytes during training.
    - gpu_usage_inference: Estimated GPU memory usage in bytes during inference.
    """
    # Ensure the model is on the CPU for initial RAM estimation
    model = model.cpu()
    
    # Create a dummy input tensor with the specified batch size and input size
    dummy_input = torch.randn(batch_size, *input_size)
    
    # Estimate the model size (parameters)
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    
    # Estimate the forward pass activation size
    def forward_hook(module, input, output):
        activations.append(output)
    
    activations = []
    hooks = []
    for layer in model.modules():
        if isinstance(layer, nn.Module):
            hooks.append(layer.register_forward_hook(forward_hook))
    
    # Perform a forward pass to populate the activations list
    model(dummy_input)
    
    # Remove the hooks
    for hook in hooks:
        hook.remove()
    
    activation_size = sum(act.numel() * act.element_size() for act in activations)
    
    # Estimate the backward pass activation size (same as forward pass)
    backward_activation_size = activation_size
    
    # Total RAM usage during training (model parameters + activations for forward and backward pass)
    ram_usage_training = param_size + (activation_size + backward_activation_size)
    
    # Total RAM usage during inference (model parameters + activations for forward pass only)
    ram_usage_inference = param_size + activation_size
    
    # GPU usage estimation (if using a GPU)
    if torch.cuda.is_available():
        model = model.cuda()
        dummy_input = dummy_input.cuda()
        
        # Clear any existing memory
        torch.cuda.empty_cache()
        
        # Perform a forward pass to estimate GPU memory usage during inference
        gpu_memory_before = torch.cuda.memory_allocated()
        output = model(dummy_input)
        gpu_memory_after = torch.cuda.memory_allocated()
        
        gpu_usage_inference = gpu_memory_after - gpu_memory_before
        
        # Perform a backward pass to estimate GPU memory usage during training
        loss = output.sum()
        gpu_memory_before = torch.cuda.memory_allocated()
        loss.backward()
        gpu_memory_after = torch.cuda.memory_allocated()
        
        gpu_usage_training = gpu_memory_after - gpu_memory_before
        
        # Free GPU resources
        del dummy_input
        del output
        del loss
        model = model.cpu()
        torch.cuda.empty_cache()
    else:
        gpu_usage_training = None
        gpu_usage_inference = None
    
    return ram_usage_training, ram_usage_inference, gpu_usage_training, gpu_usage_inference

# Example usage
if __name__ == "__main__":
    # Define a simple model for demonstration
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc1 = nn.Linear(784, 256)
            self.fc2 = nn.Linear(256, 128)
            self.fc3 = nn.Linear(128, 10)
        
        def forward(self, x):
            x = torch.flatten(x, 1)
            x = torch.relu(self.fc1(x))
            x = torch.relu(self.fc2(x))
            x = self.fc3(x)
            return x
    
    model = SimpleModel()
    batch_size = 32
    input_size = (1, 28, 28)  # Example for MNIST dataset
    
    # ram_usage_training, ram_usage_inference, gpu_usage_training, gpu_usage_inference = calculate_memory_usage(model, batch_size, input_size)
    
    # print(f"Estimated RAM usage during training: {ram_usage_training / (1024 ** 2):.2f} MB")
    # print(f"Estimated RAM usage during inference: {ram_usage_inference / (1024 ** 2):.2f} MB")
    # print(f"Estimated GPU usage during training: {gpu_usage_training / (1024 ** 2):.2f} MB" if gpu_usage_training else "GPU not available")
    # print(f"Estimated GPU usage during inference: {gpu_usage_inference / (1024 ** 2):.2f} MB" if gpu_usage_inference else "GPU not available")


In [1]:
import tensorflow as tf
import numpy as np

# Example model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, input_shape=(784,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

# Generate example data
num_samples = 1000
x_train = np.random.rand(num_samples, 784)
y_train = np.random.randint(10, size=num_samples)

# Profiling with TensorBoard
tf.profiler.experimental.start('logdir')

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5)

tf.profiler.experimental.stop()


2024-06-07 12:21:47.215868: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-06-07 12:21:48.339170: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-07 12:21:48.356520: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-06-07 12:21:48.387296: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:104] Profiler session initializing.
2024-06-07 12:21:48.387314: I external/local_tsl/tsl/profiler/lib/profiler_se

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0928 - loss: 2.6075   
Epoch 2/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.1472 - loss: 2.3491 
Epoch 3/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.1347 - loss: 2.2571 
Epoch 4/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.1770 - loss: 2.2080 
Epoch 5/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.2131 - loss: 2.1673 


2024-06-07 12:21:49.217694: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:70] Profiler session collecting data.
2024-06-07 12:21:49.218570: E external/local_xla/xla/backends/profiler/gpu/cupti_error_manager.cc:153] cuptiEnableCallback: ignored due to a previous error.
2024-06-07 12:21:49.218583: E external/local_xla/xla/backends/profiler/gpu/cupti_error_manager.cc:223] cuptiGetResultString: ignored due to a previous error.
2024-06-07 12:21:49.218587: E external/local_xla/xla/backends/profiler/gpu/cupti_tracer.cc:1310] function cupti_interface_->EnableCallback( 0 , subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid)failed with error 
2024-06-07 12:21:49.218592: E external/local_xla/xla/backends/profiler/gpu/cupti_error_manager.cc:142] cuptiFinalize: ignored due to a previous error.
2024-06-07 12:21:49.218595: E external/local_xla/xla/backends/profiler/gpu/cupti_error_manager.cc:223] cuptiGetResultString: ignored due to a previous error.
2024-06-07 12:21:49.218598: E external/loca

In [3]:
!tensorboard --logdir=logdir


2024-06-07 12:22:43.210220: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-07 12:22:44.286313: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-07 12:22:44.302389: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platf

In [5]:
from torch.profiler import profile, record_function, ProfilerActivity

input_data = torch.randn(batch_size, *input_size)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    with record_function("model_inference"):
        output = model(input_data)

print(prof.key_averages().table(sort_by="cuda_memory_usage", row_limit=10))


INFO:2024-06-07 12:25:21 3241325:3241325 init.cpp:169] If you see CUPTI_ERROR_INSUFFICIENT_PRIVILEGES, refer to https://developer.nvidia.com/nvidia-development-tools-solutions-err-nvgpuctrperm-cupti
STAGE:2024-06-07 12:25:21 3241325:3241325 ActivityProfilerController.cpp:314] Completed Stage: Warm Up


----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
       model_inference        20.15%     327.000us       100.00%       1.623ms       1.623ms             1  
         aten::flatten         5.36%      87.000us         6.35%     103.000us     103.000us             1  
            aten::view         0.99%      16.000us         0.99%      16.000us      16.000us             1  
          aten::linear         5.11%      83.000us        70.24%       1.140ms     380.000us             3  
               aten::t         1.66%      27.000us         3.27%      53.000us      17.667us             3  
       aten::transpose         1.23%      20.000us         1.60%      26.000us       8.667us             3  
      aten::as_stri

STAGE:2024-06-07 12:25:22 3241325:3241325 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-06-07 12:25:22 3241325:3241325 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [7]:
from memory_profiler import profile

@profile
def train_model():
    output = model(input_data)

train_model()


ERROR: Could not find file /tmp/ipykernel_3241325/2924412536.py
