# Storage and compute analysis of transformers

List of models available https://huggingface.co/models

In [1]:
from dataclasses import dataclass
from typing import Literal
import warnings

In [2]:
def custom_formatwarning(msg, *args, **kwargs):
    # ignore everything except the message
    return 'Warning: ' + str(msg) + '\n'

warnings.formatwarning = custom_formatwarning

In [3]:
def gigabytes(bytes: int) -> float:
    return bytes / 1024 / 1024 / 1024

In [4]:
class Params:

    n_layers:int
    hidden_size:int # feed forward dimension
    n_heads:int
    n_microbatch:int=1 # for pipeline parallelism
    s_length:int=512
    vocab_size:int=50257

    def __init__(self, config):
        self.n_layers = config.n_layers
        self.hidden_size = config.hidden_size
        self.n_heads = config.n_heads
        if hasattr(config, 'n_microbatch'):
            self.n_microbatch = config.n_microbatch
        else:
            warnings.warn("Number of microbatches not found in config, defaulting to 1")
        if hasattr(config, 's_length'):
            self.s_length = config.s_length
        else:
            warnings.warn("Sequence length not found in config, defaulting to 512")
        if hasattr(config, 'vocab_size'):
            self.vocab_size = config.vocab_size
        else:
            warnings.warn("Vocabulary size not found in config, defaulting to 50257")


    def model_params(self) -> int:
        """In number of elements, LayerNorm params are included but may not be
        necessary, since they are negligible compared to other parameters.
        https://insujang.github.io/2022-07-30/analysis-of-transformer-model/#fn:3
        In number of elements
        """

        # Attention block parameters
        params = self.n_layers * (
            3 * self.hidden_size * self.hidden_size +  # Q, K, V
            self.hidden_size * self.hidden_size +      # Output projection
            2 * self.hidden_size                       # LayerNorm (gamma and beta)
        )
        # MLP parameters
        params += self.n_layers * (
            self.hidden_size * 4 * self.hidden_size +  # First linear layer
            4 * self.hidden_size * self.hidden_size +  # Second linear layer
            2 * self.hidden_size                       # LayerNorm (gamma and beta)
        )
        # Embedding parameters
        params += self.vocab_size * self.hidden_size  # Token embeddings
        params += self.s_length * self.hidden_size    # Position embeddings

        return params
    
    def gradients(self) -> int:
        """In number of elements"""
        return self.model_params()
    
    def optimizer_states(self) -> int:
        """Adam stores a FP32 copy of parameters, plus momentum and variance estimates
        In number of elements"""
        return 3 * self.model_params()
    
    def activation_size(self) -> int:
        """Activation size calculation per layer in bytes, taken from NVIDIA's
        analysis, https://arxiv.org/pdf/2205.05198
        In bytes"""

        # Attention activations
        activations = (5*2 +1) * self.s_length * self.hidden_size 
        activations += 5 * self.n_heads * self.s_length * self.s_length
        
        # MLP activations
        activations += (9*2 + 1) * self.s_length * self.hidden_size

        # LayerNorm activations
        activations += (2*2) * self.s_length * self.hidden_size

        return activations * self.n_layers * self.n_microbatch
    
    def total_size(self) -> int:
        """Total size of model parameters, gradients, optimizer states, and activations
        in bytes"""
        total = 2 * self.model_params()  # parameters
        total += 2 * self.gradients()    # gradients
        total += 4 * self.optimizer_states()  # optimizer states
        total += self.activation_size()   # activations
        return total/1024/1024/1024  # in GB

    def __str__(self):
        return (
            f"Properties\n"
            f"  Number of Layers: {self.n_layers}\n"
            f"  Hidden Size     : {self.hidden_size}\n"
            f"  Number of Heads : {self.n_heads}\n"
            f"  Microbatch Size : {self.n_microbatch}\n"
            f"  Sequence Length : {self.s_length}\n"
            f"  Vocabulary Size : {self.vocab_size}\n"
        )

    def print_parameters(self):
        return (
            f"Total Parameters\n"
            f"  Model Parameters: {self.model_params()/1e6:.0f}M elements\n"
            f"                  : {gigabytes(2 * self.model_params()):.2f} GB\n"
            f"  Gradients       : {self.gradients()/1e6:.0f}M elements\n"
            f"                  : {gigabytes(2 * self.gradients()):.2f} GB\n"
            f"  Optimizer States: {self.optimizer_states()/1e6:.0f}M elements\n"
            f"                  : {gigabytes(4 * self.optimizer_states()):.2f} GB\n"
            f"  Activation Size : {gigabytes(self.activation_size()):.2f} GB\n"
            f"  Total Size      : {self.total_size():.2f} GB\n"
        )
    
    def forward_flops_per_token(self) -> int:
        """FLOPs per token calculation for a forward pass
        https://arxiv.org/pdf/2205.05198
        """

        # Attention FLOPs
        flops = self.n_layers * self.n_microbatch * (
            # Q, K, V transformations
            6 * self.hidden_size * self.hidden_size  * self.s_length +
            # Attention matrix calculation
            2 * self.hidden_size * self.s_length * self.s_length  +
            # Attention over values
            2 * self.hidden_size * self.s_length * self.s_length  +
            # Post-attention linear projection
            2 * self.s_length * self.s_length  * self.hidden_size 
        )

        # MLP FLOPs (2 layers)
        flops += self.n_layers * self.s_length * self.n_microbatch * (
            2 * self.hidden_size * 4 * self.hidden_size +  # First linear layer
            2 * 4 * self.hidden_size * self.hidden_size     # Second linear layer
        )

        # logits layer FLOPs
        flops += 2 * self.n_microbatch * self.s_length * self.hidden_size * self.vocab_size

        return flops
    
    def backward_flops_per_token(self) -> int:
        # Backward pass FLOPs is roughly twice the forward pass FLOPs
        # if using activation recomputation, it's 3x
        return 2 * self.forward_flops_per_token()
    
    def print_flops(self):
        fwd_flops = self.forward_flops_per_token()
        bwd_flops = self.backward_flops_per_token()
        total_flops = fwd_flops + bwd_flops

        return (
            f"FLOPs per Token\n"
            f"  Forward Pass : {fwd_flops/1e9:.0f} GFLOPs\n"
            f"  Backward Pass: {bwd_flops/1e9:.0f} GFLOPs\n"
            f"  Total        : {total_flops/1e9:.0f} GFLOPs\n"
        )




In [5]:
@dataclass
class DeepSeekV32Exp:
    # tried decoding from https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp#deepseek-v32-exp
    n_layers:int=62
    hidden_size:int=7168 # feed forward dimension
    n_heads:int=12 # guessing, seems low
    n_microbatch:int=1 # for pipeline parallelism
    s_length:int=512
    vocab_size:int=129280
    
deepseekv32 = Params(DeepSeekV32Exp())
print(deepseekv32)
print(deepseekv32.print_parameters())
print(deepseekv32.print_flops())


Properties
  Number of Layers: 62
  Hidden Size     : 7168
  Number of Heads : 12
  Microbatch Size : 1
  Sequence Length : 512
  Vocabulary Size : 129280

Total Parameters
  Model Parameters: 39159M elements
                  : 72.94 GB
  Gradients       : 39159M elements
                  : 72.94 GB
  Optimizer States: 117477M elements
                  : 437.64 GB
  Activation Size : 8.11 GB
  Total Size      : 591.63 GB

FLOPs per Token
  Forward Pass : 37530 GFLOPs
  Backward Pass: 75060 GFLOPs
  Total        : 112591 GFLOPs



In [6]:
@dataclass
class BERTConfig:
    # Bert Based Uncased, random old example I found in Huggingface
    n_layers: int = 12
    hidden_size: int = 768
    n_heads: int = 12

bert = Params(BERTConfig())
print(bert)
print(bert.print_parameters())
print(bert.print_flops())

Properties
  Number of Layers: 12
  Hidden Size     : 768
  Number of Heads : 12
  Microbatch Size : 1
  Sequence Length : 512
  Vocabulary Size : 50257

Total Parameters
  Model Parameters: 124M elements
                  : 0.23 GB
  Gradients       : 124M elements
                  : 0.23 GB
  Optimizer States: 372M elements
                  : 1.39 GB
  Activation Size : 0.33 GB
  Total Size      : 2.17 GB

FLOPs per Token
  Forward Pass : 134 GFLOPs
  Backward Pass: 267 GFLOPs
  Total        : 401 GFLOPs





In [7]:
@dataclass
class GPT3Small:
    # https://en.wikipedia.org/wiki/GPT-3#GPT-3_models
    # https://www.lesswrong.com/posts/3duR8CrvcHywrnhLo/how-does-gpt-3-spend-its-175b-parameters
    n_layers:int=12
    hidden_size:int=768 # d_model
    n_heads:int=12
    n_microbatch:int=1 # for pipeline parallelism
    s_length:int=2048
    vocab_size:int=50257
    
gpt3small = Params(GPT3Small())
print(gpt3small)
print(gpt3small.print_parameters())
print(gpt3small.print_flops())

Properties
  Number of Layers: 12
  Hidden Size     : 768
  Number of Heads : 12
  Microbatch Size : 1
  Sequence Length : 2048
  Vocabulary Size : 50257

Total Parameters
  Model Parameters: 125M elements
                  : 0.23 GB
  Gradients       : 125M elements
                  : 0.23 GB
  Optimizer States: 375M elements
                  : 1.40 GB
  Activation Size : 3.41 GB
  Total Size      : 5.27 GB

FLOPs per Token
  Forward Pass : 709 GFLOPs
  Backward Pass: 1418 GFLOPs
  Total        : 2127 GFLOPs



In [8]:
@dataclass
class GPT3Medium:
    # https://en.wikipedia.org/wiki/GPT-3#GPT-3_models
    # https://www.lesswrong.com/posts/3duR8CrvcHywrnhLo/how-does-gpt-3-spend-its-175b-parameters
    n_layers:int=24
    hidden_size:int=1024 # d_model
    n_heads:int=16
    s_length:int=2048
    vocab_size:int=50257
    n_microbatch:int=1

gpt3medium = Params(GPT3Medium())
print(gpt3medium)
print(gpt3medium.print_parameters())
print(gpt3medium.print_flops())

Properties
  Number of Layers: 24
  Hidden Size     : 1024
  Number of Heads : 16
  Microbatch Size : 1
  Sequence Length : 2048
  Vocabulary Size : 50257

Total Parameters
  Model Parameters: 356M elements
                  : 0.66 GB
  Gradients       : 356M elements
                  : 0.66 GB
  Optimizer States: 1067M elements
                  : 3.97 GB
  Activation Size : 9.09 GB
  Total Size      : 14.39 GB

FLOPs per Token
  Forward Pass : 1963 GFLOPs
  Backward Pass: 3926 GFLOPs
  Total        : 5889 GFLOPs



In [9]:
@dataclass
class GPT3XL:
    # https://en.wikipedia.org/wiki/GPT-3#GPT-3_models
    # https://www.lesswrong.com/posts/3duR8CrvcHywrnhLo/how-does-gpt-3-spend-its-175b-parameters
    n_layers:int=24
    hidden_size:int=2048 # d_model
    n_heads:int=24
    s_length:int=2048
    vocab_size:int=50257
    n_microbatch:int=1

gpt3xl = Params(GPT3XL())
print(gpt3xl)
print(gpt3xl.print_parameters())
print(gpt3xl.print_flops())

Properties
  Number of Layers: 24
  Hidden Size     : 2048
  Number of Heads : 24
  Microbatch Size : 1
  Sequence Length : 2048
  Vocabulary Size : 50257

Total Parameters
  Model Parameters: 1315M elements
                  : 2.45 GB
  Gradients       : 1315M elements
                  : 2.45 GB
  Optimizer States: 3946M elements
                  : 14.70 GB
  Activation Size : 14.44 GB
  Total Size      : 34.04 GB

FLOPs per Token
  Forward Pass : 6194 GFLOPs
  Backward Pass: 12388 GFLOPs
  Total        : 18582 GFLOPs



In [10]:
@dataclass
class GPT3175B:
    # https://en.wikipedia.org/wiki/GPT-3#GPT-3_models
    # https://www.lesswrong.com/posts/3duR8CrvcHywrnhLo/how-does-gpt-3-spend-its-175b-parameters
    n_layers:int=96
    hidden_size:int=12288 # d_model
    n_heads:int=96
    s_length:int=2048
    vocab_size:int=50257
    n_microbatch:int=1

gpt3175B = Params(GPT3175B())
print(gpt3175B)
print(gpt3175B.print_parameters())
print(gpt3175B.print_flops())

Properties
  Number of Layers: 96
  Hidden Size     : 12288
  Number of Heads : 96
  Microbatch Size : 1
  Sequence Length : 2048
  Vocabulary Size : 50257

Total Parameters
  Model Parameters: 174594M elements
                  : 325.21 GB
  Gradients       : 174594M elements
                  : 325.21 GB
  Optimizer States: 523781M elements
                  : 1951.24 GB
  Activation Size : 256.50 GB
  Total Size      : 2858.15 GB

FLOPs per Token
  Forward Pass : 685326 GFLOPs
  Backward Pass: 1370652 GFLOPs
  Total        : 2055979 GFLOPs

