In [6]:
# GPT-3 175B configuration.
# Rough parameter tally: only the large weight matrices are counted
# explicitly; everything else is folded into a residual term further down.
num_layers, hidden_size, vocab_size = 96, 12288, 50000
ffn_size = hidden_size * 4  # inner width of the feedforward sub-layer

# --- Token embedding table: one hidden_size-wide row per vocabulary entry ---
embedding_params = hidden_size * vocab_size

# --- Weights inside a single transformer layer ---
# Attention uses four square (hidden_size x hidden_size) projections:
# Q, K, V and the output projection.
attention_params_per_layer = 4 * hidden_size ** 2
# Feedforward uses two linear maps: hidden_size -> ffn_size -> hidden_size.
ffn_params_per_layer = 2 * ffn_size * hidden_size
params_per_layer = sum((attention_params_per_layer, ffn_params_per_layer))

# --- All transformer layers stacked ---
transformer_params = params_per_layer * num_layers

# --- Residual ("misc") term ---
# NOTE: misc_params is NOT derived from the architecture. It is back-solved
# as whatever is left after subtracting the counted weights from the target,
# so total_params below equals target_total_params by construction.
target_total_params = 175_105_800_000
misc_params = target_total_params - embedding_params - transformer_params

# --- Grand total, plus a copy scaled to billions for display ---
total_params = sum((embedding_params, transformer_params, misc_params))
total_params_billion = total_params / 1e9

# --- Report: one line per component, then the total ---
print(
    f"Embedding params: {embedding_params:,}",
    f"Params per transformer layer: {params_per_layer:,}",
    f"Transformer total params: {transformer_params:,}",
    f"Misc (LayerNorm, etc.): {misc_params:,}",
    f"Total GPT-3 parameters: {total_params:,} ({total_params_billion:.2f}B)",
    sep="\n",
)


Embedding params: 614,400,000
Params per transformer layer: 1,811,939,328
Transformer total params: 173,946,175,488
Misc (LayerNorm, etc.): 545,224,512
Total GPT-3 parameters: 175,105,800,000 (175.11B)
