In [1]:
import m2_utilities.flops as flops
from m2_utilities.model.qwen import load_qwen

%load_ext autoreload
%autoreload 2

In [2]:
# Load the model/tokenizer pair returned by the project helper `load_qwen`.
# NOTE(review): neither `model` nor `tokenizer` is referenced by any cell visible
# in this notebook -- confirm whether this load is still needed.
model, tokenizer = load_qwen()

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


### Single Forward and Backward Pass

In [4]:
# Sequence length; reused by the per-component cells further down.
N_TOKENS = 512

# NOTE(review): backpropagate=True presumably adds the backward-pass cost, so this
# is not a forward-only count -- the saved total is roughly 3x the sum of the forward
# components printed in the later cells. Confirm against flops.compute_flops.
n_flops = flops.compute_flops(N_TOKENS, backpropagate=True)
print(f"Total FLOPS: {n_flops:.4e}")

Total FLOPS: 1.6955e+12


### Generate 90 Tokens Given a Context Length of 10

In [5]:
# Inputs to the generation cost estimate, named so the call below reads on its own.
# (Presumably: prompt length, number of generated tokens, and number of sequences.)
N_CONTEXT, N_GENERATE = 10, 90
BATCH_SIZE = 150

n_flops = flops.compute_flops_gen(N_CONTEXT, N_GENERATE, batch_size=BATCH_SIZE)
print(f"Total FLOPS: {n_flops:.4e}")

Total FLOPS: 5.8470e+14


### Adding Positional Embeddings

In [7]:
# Cost reported by flops.embedding for N_TOKENS tokens at hidden size 896.
# NOTE(review): the label below misspells "Embedding" ("Embbeding"); the saved output
# carries the same typo, so fix the string and re-run the cell together.
n_flops = flops.embedding(N_TOKENS, hidden_size=896)
print(f"Embbeding Layer: {n_flops:.2e}")

Embbeding Layer: 4.59e+05


### All Self-Attention Blocks

In [8]:
N_LAYERS = 24
# Keyword arguments describing one block, grouped so they read as a single unit.
block_dims = {"n_heads": 14, "hidden_size": 896, "intermediate_size": 4864}

n_flops = flops.block(N_TOKENS, **block_dims)
print(f"Single Block: {n_flops:.2e}")
# The full stack is costed as N_LAYERS copies of the single-block figure.
print(f"{N_LAYERS} Blocks: {N_LAYERS * n_flops:.2e}")

Single Block: 1.77e+10
24 Blocks: 4.25e+11


### Breakdown of a Single Block

In [9]:
# Hidden size shared by all four component calls in this cell.
HIDDEN_SIZE = 896

# Feed-forward network.
n_flops = flops.ffn(N_TOKENS, hidden_size=HIDDEN_SIZE, intermediate_size=4864)
print(f"FFN: {n_flops:.2e}")

# Multi-head self-attention.
n_flops = flops.multi_head_self_attention(N_TOKENS, n_heads=14, hidden_size=HIDDEN_SIZE)
print(f"MHSA: {n_flops:.2e}")

# RMS normalisation.
n_flops = flops.rms_norm(N_TOKENS, hidden_size=HIDDEN_SIZE)
print(f"RMSNorm: {n_flops:.2e}")

# Residual addition.
n_flops = flops.add_residual(N_TOKENS, hidden_size=HIDDEN_SIZE)
print(f"Residual: {n_flops:.2e}")

FFN: 1.34e+10
MHSA: 4.27e+09
RMSNorm: 5.97e+06
Residual: 4.59e+05


### Post Self-Attention Operations

In [10]:
# Vocabulary size: the output width of the final projection and the length of the
# vector passed to the softmax call, so both calls below share one constant.
VOCAB_SIZE = 151936

n_flops = flops.final_linear(N_TOKENS, hidden_size=896, vocab_size=VOCAB_SIZE)
print(f"Final Linear Transform: { n_flops:.2e}")

n_flops = flops.softmax(N_TOKENS, vector_size=VOCAB_SIZE)
print(f"Final Softmax: {n_flops:.2e}")


Final Linear Transform: 1.39e+11
Final Softmax: 9.33e+08
