In [12]:
# Project-local module providing the FLOP-counting helpers used by every cell below.
import m2_utilities.flops as flops

# Turn on IPython's autoreload extension so edits to imported modules are picked up
# without restarting the kernel (mode 2 = reload all modules before each execution).
# Re-running this cell in a live kernel prints an "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Single Forward Pass

In [25]:
# Dimensions shared by all cells in this notebook.
# NOTE(review): the meanings below are inferred from the names and from how the
# constants are passed to the flops helpers in later cells — confirm against the model config.

# Input size
N_TOKENS = 512          # sequence length the estimate is computed for

# Architecture
N_LAYERS = 24           # number of stacked blocks
N_HEADS = 14            # attention heads per block
D_MODEL = 896           # model width
HIDDEN_SIZE = 4_864     # passed to flops.ffn / flops.block; presumably the FFN inner width
VOCAB_SIZE = 151_646    # passed to flops.final_linear / flops.softmax

# Total for one forward pass (no backward pass).
# NOTE(review): only N_TOKENS is forwarded here; the architecture constants above are
# not passed, so this total relies on compute_flops' own defaults — verify they agree.
n_flops = flops.compute_flops(
    N_TOKENS,
    backpropagate=False,
)
print(f"Total FLOPS: {n_flops:.2e}")

Total FLOPS: 5.65e+11


### Adding Positional Embeddings

In [22]:
# Embedding-stage cost for the full sequence, from the token count and model width.
n_flops = flops.embedding(
    N_TOKENS,
    D_MODEL,
)
print(f"Embbeding Layer: {n_flops:.2e}")

Embbeding Layer: 4.59e+05


### All Self-Attention Blocks

In [23]:
# Cost of one block, then the same figure scaled by the number of layers.
n_flops = flops.block(N_TOKENS, N_HEADS, D_MODEL, HIDDEN_SIZE)
print(
    f"Single Block: {n_flops:.2e}",
    f"{N_LAYERS} Blocks: {N_LAYERS * n_flops:.2e}",
    sep="\n",
)

Single Block: 1.77e+10
24 Blocks: 4.25e+11


### Breakdown of a Single Block

In [33]:
# Per-component FLOPs inside a single block.
#
# The rows are listed MHSA -> FFN -> RMSNorm -> Add Residual so that the printed order
# matches the recorded output of this cell (the previous code printed FFN first, which
# disagreed with the saved output). One table plus one loop replaces four copy-pasted
# compute/print pairs.
component_flops = (
    ("MHSA", flops.multi_head_self_attention(N_TOKENS, N_HEADS, D_MODEL)),
    ("FFN", flops.ffn(N_TOKENS, D_MODEL, HIDDEN_SIZE)),
    ("RMSNorm", flops.rms_norm(N_TOKENS, D_MODEL)),
    ("Add Residual", flops.add_residual(N_TOKENS, D_MODEL)),
)

# `n_flops` is kept as the loop variable so it ends up holding the Add Residual value,
# exactly as it did after the original cell ran.
for label, n_flops in component_flops:
    print(f"{label}: {n_flops:.2e}")

MHSA: 4.27e+09
FFN: 1.34e+10
RMSNorm: 5.97e+06
Add Residual: 4.59e+05


### Post Self-Attention Operations

In [32]:
# Output head: projection onto the vocabulary followed by a softmax.
# Both helper results are scaled by N_TOKENS here.
# NOTE(review): this suggests final_linear / softmax return a per-token cost — confirm.

# Final linear transform over all tokens.
n_flops = N_TOKENS * flops.final_linear(D_MODEL, VOCAB_SIZE)
print(f"Final Linear Transform: {n_flops:.2e}")

# Final softmax over the vocabulary for all tokens.
n_flops = N_TOKENS * flops.softmax(VOCAB_SIZE)
print(f"Final Softmax: {n_flops:.2e}")


Final Linear Transform: 1.39e+11
Final Softmax: 9.32e+08
