In [1]:
def calculate_gpt_parameters(
    num_layers: int,
    hidden_size: int,
    vocab_size: int,
    ff_multiplier: int = 4,
    *,
    tie_embeddings: bool = False,
) -> int:
    """
    Estimate the number of weights in a GPT-style transformer.

    The estimate counts, per transformer block, the four attention
    projections (Q, K, V, output), the two feedforward linear layers, and
    the gain/bias vectors of two LayerNorms. On top of the blocks it adds
    the token embedding matrix and the output (unembedding) projection.

    This formula has no terms for linear-layer biases, positional
    embeddings, or a final LayerNorm, so the result is an approximation.

    Parameters:
        num_layers (int): Number of transformer blocks.
        hidden_size (int): Dimensionality of the hidden embeddings (d_model).
        vocab_size (int): Size of the tokenizer vocabulary.
        ff_multiplier (int): Expansion factor for feedforward network (default = 4).
        tie_embeddings (bool): Keyword-only. When True, the token embedding
            and the output projection are treated as one shared matrix and
            counted once. When False (default), they are counted as two
            separate matrices.

    Returns:
        int: Total number of parameters.
    """
    d = hidden_size
    d_ff = ff_multiplier * d

    # Attention: Q, K, V, and output projection, each a d x d matrix.
    attention_params = 4 * d * d

    # Feedforward network: d -> d_ff and d_ff -> d linear layers.
    ffn_params = 2 * d * d_ff

    # LayerNorm: gain and bias (2 vectors of size d) for 2 norms per block.
    layernorm_params = 4 * d

    per_layer_params = attention_params + ffn_params + layernorm_params
    transformer_total = num_layers * per_layer_params

    # Token embedding and output projection are each vocab_size x d.
    # A tied model shares one matrix between them, so it is counted once.
    embedding_matrices = 1 if tie_embeddings else 2
    embedding_total = embedding_matrices * vocab_size * d

    return transformer_total + embedding_total


# Worked example using the GPT-3 175B configuration:
# 96 transformer blocks, d_model = 12288, vocabulary of 50257 tokens.
num_layers, hidden_size, vocab_size = 96, 12288, 50257

# ff_multiplier is left at its default of 4.
total_params = calculate_gpt_parameters(
    num_layers=num_layers,
    hidden_size=hidden_size,
    vocab_size=vocab_size,
)

# Report the exact count with thousands separators plus a rounded value in billions.
print(f"Total GPT-3 Parameters: {total_params:,} (~{total_params/1e9:.2f}B)")


Total GPT-3 Parameters: 175,186,010,112 (~175.19B)
