In [1]:
import os
import sys
import yaml
from collections import OrderedDict
sys.path.append("..")
from llm.tokenizer import LBPETokenizer


# Absolute location of the training config: <parent of the working directory>/llm/config/train.yml
CONFIG_TRAIN_PATH = os.path.abspath(os.path.join(os.pardir, "llm", "config", "train.yml"))

def load_config(path: str) -> dict:
    """Read a YAML file and return its parsed content.

    Args:
        path: location of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file (a dict for a
        mapping document).

    Raises:
        OSError: if the file cannot be opened.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)

def params(n_embd: int, block_size: int, vocab_size: int, n_layer: int):
    """Estimate the number of parameters in the model, split by component.

    Returns an OrderedDict of component name -> parameter count. The
    'attention*', 'mlp*' and 'block' entries describe a single block,
    'transformer' is n_layer such blocks, and 'total' adds the embeddings,
    the final layernorm and the output layer on top.
    """
    # embedding tables: one row per position and one row per token
    pos_table = n_embd * block_size
    tok_table = n_embd * vocab_size

    # attention part of a block (layernorm counted without bias)
    attn_norm = n_embd
    attn_kqv = n_embd * 3*n_embd
    attn_out = n_embd**2
    attn_all = attn_norm + attn_kqv + attn_out

    # MLP part of a block, hidden width is 4x the embedding width
    hidden = 4*n_embd
    mlp_norm = n_embd
    mlp_in = n_embd * hidden
    mlp_out = hidden * n_embd
    mlp_all = mlp_norm + mlp_in + mlp_out

    # stack of blocks plus the pieces that follow it
    per_block = attn_all + mlp_all
    stack = n_layer * per_block
    final_norm = n_embd
    head = 0  # counted as zero: this layer reuses the embedding weights

    emb_all = pos_table + tok_table
    return OrderedDict([
        # (sic) the misspelled key is kept because other code looks it up by this exact name
        ('emebedding/position', pos_table),
        ('embedding/token', tok_table),
        ('embedding', emb_all),
        ('attention/ln', attn_norm),
        ('attention/kqv', attn_kqv),
        ('attention/proj', attn_out),
        ('attention', attn_all),
        ('mlp/ln', mlp_norm),
        ('mlp/ffw', mlp_in),
        ('mlp/proj', mlp_out),
        ('mlp', mlp_all),
        ('block', per_block),
        ('transformer', stack),
        ('ln_f', final_norm),
        ('dense', head),
        ('total', emb_all + stack + final_norm + head),
    ])

def flops(n_embd: int, block_size: int, vocab_size: int, n_layer: int, n_head: int):
    """Estimate the matmul FLOPs for one sequence of block_size tokens.

    Only weight/matrix multiplications are counted; LayerNorm, softmax and
    similar layers are ignored. Real FLOPs are counted rather than MACs, so
    a product (B x C) @ (C x D) costs 2*B*C*D.

    Returns an OrderedDict of component name -> FLOPs. The 'attention*',
    'mlp*' and 'block' entries describe one block; 'backward_total' uses the
    common estimate of twice the forward pass.
    """
    head_size = n_embd // n_head
    hidden = 4*n_embd  # feed forward width

    # attention: k/q/v projection, score matrix, weighted sum of values, output projection
    attn_kqv = 2 * block_size * (n_embd * 3*n_embd)
    attn_scores = 2 * block_size * block_size * n_embd
    # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
    attn_reduce = 2 * n_head * (block_size * block_size * head_size)
    attn_proj = 2 * block_size * (n_embd * n_embd)
    attn_all = attn_kqv + attn_scores + attn_reduce + attn_proj

    # MLP: expand to the hidden width, then project back
    mlp_up = 2 * block_size * (n_embd * hidden)
    mlp_down = 2 * block_size * (hidden * n_embd)
    mlp_all = mlp_up + mlp_down

    # full stack plus the output layer
    per_block = attn_all + mlp_all
    stack = n_layer * per_block
    head = 2 * block_size * (n_embd * vocab_size)

    fwd = stack + head
    bwd = 2 * fwd  # backward pass estimated as 2x forward

    return OrderedDict([
        ('attention/kqv', attn_kqv),
        ('attention/scores', attn_scores),
        ('attention/reduce', attn_reduce),
        ('attention/proj', attn_proj),
        ('attention', attn_all),
        ('mlp/ffw1', mlp_up),
        ('mlp/ffw2', mlp_down),
        ('mlp', mlp_all),
        ('block', per_block),
        ('transformer', stack),
        ('dense', head),
        ('forward_total', fwd),
        ('backward_total', bwd),
        ('total', fwd + bwd),
    ])

# now here is an estimate copy pasted from the PaLM paper
# this formula is often used to calculate MFU (model flops utilization)
def palm_flops(n_embd: int,
    block_size: int,
    vocab_size: int,
    n_layer: int,
    n_head: int):
    """Estimate the model FLOPs for one sequence following the PaLM paper formula.

    Args:
        n_embd: embedding width.
        block_size: number of tokens in a sequence.
        vocab_size: vocabulary size (forwarded to ``params``).
        n_layer: number of transformer blocks.
        n_head: number of attention heads.

    Returns:
        ``(6*N + 12*L*H*Q*T) * block_size``, where N is the parameter count
        from ``params`` minus the position-embedding parameters.
    """
    # Compute the parameter breakdown once and read both entries from it.
    counts = params(
        n_embd=n_embd,
        block_size=block_size,
        vocab_size=vocab_size,
        n_layer=n_layer,
    )
    # non-embedding model parameters. note that we do not subtract the
    # embedding/token params because those are tied and get used in the last layer.
    N = counts['total'] - counts['emebedding/position']
    L, H, Q, T = n_layer, n_head, n_embd//n_head, block_size
    mf_per_token = 6*N + 12*L*H*Q*T
    # scale the per-token estimate to a full sequence of block_size tokens
    mf = mf_per_token * block_size
    return mf

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Parse the training YAML at CONFIG_TRAIN_PATH; the cells below read their hyperparameters from it.
config = load_config(CONFIG_TRAIN_PATH)

In [3]:
# Create the project tokenizer and load the saved LBPE model file
# (the path is relative, so it depends on the notebook's working directory).
tokenizer = LBPETokenizer()
tokenizer.load(model_file='../llm/LBPE.model')

In [4]:
# Show how many entries the loaded tokenizer's vocabulary has.
len(tokenizer.vocab)

264

In [5]:
# Read the model hyperparameters from the "DecoderTransformer" section of the training config.
block_size = config["DecoderTransformer"]["context_size"]
# NOTE(review): hard-coded, and larger than the len(tokenizer.vocab) == 264 recorded in the
# previous cell's output. Presumably the extra 12 entries are special tokens — confirm, or
# derive this value instead of hard-coding it.
vocab_size = 276
n_layer = config["DecoderTransformer"]["n_layers"]
n_head = config["DecoderTransformer"]["num_heads"]
n_embd = config["DecoderTransformer"]["embedding_dim"]
bias = config["DecoderTransformer"]["qkv_bias"]
# The estimates in this notebook do not count bias terms, so refuse to continue if the config enables them.
assert not bias, "this notebook assumes bias=False just for simplicity"

In [6]:
# Hard-coded reference parameter count that the analytic estimate is compared against.
# NOTE(review): presumably the count reported by PyTorch for the real model — confirm it
# belongs to the same config that is loaded above.
p1 = 436_778_628
# Print the reference count in billions of parameters.
print(f"# Params: {p1/1e9} B")

# Params: 0.436778628 B


In [7]:
# compare our param count to that reported by PyTorch
# NOTE(review): the recorded output of this cell shows match: False (85362432 vs 436778628),
# so the hard-coded reference p1 and the loaded config do not describe the same model — confirm.
p = params(
    n_embd=n_embd,
    block_size=block_size,
    vocab_size=vocab_size,
    n_layer=n_layer
)
params_total = p['total']
print(f"we see: {params_total}, expected: {p1}, match: {params_total == p1}")
# create a header
print(f"{'name':20s} {'params':10s} {'ratio (%)':10s}")
# one row per component: name, parameter count, and its share of the total in percent
for k,v in p.items():
    print(f"{k:20s} {v:10d} {v/params_total*100:10.4f}")

we see: 85362432, expected: 436778628, match: False
name                 params     ratio (%) 
emebedding/position      196608     0.2303
embedding/token          211968     0.2483
embedding                408576     0.4786
attention/ln                768     0.0009
attention/kqv           1769472     2.0729
attention/proj           589824     0.6910
attention               2360064     2.7648
mlp/ln                      768     0.0009
mlp/ffw                 2359296     2.7639
mlp/proj                2359296     2.7639
mlp                     4719360     5.5286
block                   7079424     8.2934
transformer            84953088    99.5205
ln_f                        768     0.0009
dense                         0     0.0000
total                  85362432   100.0000


In [8]:
# we can now calculate the size of each checkpoint
# params are stored in fp32, and the AdamW optimizer has 2 additional buffers per param for statistics
params_bytes = params_total*4  # 4 bytes per fp32 parameter
params_and_buffers_bytes = params_bytes + 2*params_bytes  # weights + the two optimizer buffers
# label fixed to "est" (estimated); the old "best" was a typo and did not match the recorded output
print(f"est checkpoint size: {params_and_buffers_bytes/1e9:.2f} GB")
# NOTE(review): hard-coded reading; re-measure whenever the model config changes
measured_bytes = 1542470366 # from wc -c ckpt.pt
print(f"measured with wc -c ckpt.pt: {measured_bytes}")
# measured file size as a percentage of the estimate
print(f"fluff ratio: {measured_bytes/params_and_buffers_bytes*100:.2f}%")

est checkpoint size: 1.02 GB
measured with wc -c ckpt.pt: 1542470366
fluff ratio: 150.58%


In [19]:

# 48 GB of GPU memory, roughly (the value is 48e9 bytes).
# NOTE(review): the previous comment said "40 GB A100", which does not match this value;
# presumably the 48 GB card labelled "a 6000" elsewhere in this notebook is meant — confirm.
gpu_memory = 48e9
# share of that memory taken by the weights plus the optimizer buffers, in percent
print(f"memory ratio taken up just for parameters: {params_and_buffers_bytes / gpu_memory * 100:.2f}%")

memory ratio taken up just for parameters: 2.13%


In [10]:
# break the estimated FLOPs down by component
f = flops(
    n_embd=n_embd,
    block_size=block_size,
    vocab_size=vocab_size,
    n_layer=n_layer,
    n_head=n_head
)
# ratios below are relative to the forward pass only, so the backward and total rows exceed 100%
flops_total = f['forward_total']
print(f"{'name':20s} {'flops':14s} {'ratio (%)':10s}")
# one row per component: name, FLOPs, and its share of the forward pass in percent
for k,v in f.items():
    print(f"{k:20s} {v:14d} {v/flops_total*100:10.4f}")

name                 flops          ratio (%) 
attention/kqv             905969664     1.9690
attention/scores          100663296     0.2188
attention/reduce          100663296     0.2188
attention/proj            301989888     0.6563
attention                1409286144     3.0629
mlp/ffw1                 1207959552     2.6254
mlp/ffw2                 1207959552     2.6254
mlp                      2415919104     5.2507
block                    3825205248     8.3137
transformer             45902462976    99.7641
dense                     108527616     0.2359
forward_total           46010990592   100.0000
backward_total          92021981184   200.0000
total                  138032971776   300.0000


In [11]:
# Cross-check: the PaLM-style estimate for the same hyperparameters versus our own
# forward+backward count (f['total']); the printed ratio is pf divided by f['total'].
pf =  palm_flops(n_embd=n_embd,
    block_size=block_size,
    vocab_size=vocab_size,
    n_layer=n_layer,
    n_head=n_head
    )


print(f"palm_flops: {pf:d}, flops: {f['total']:d}, ratio: {pf/f['total']:.4f}")

palm_flops: 138062462976, flops: 138032971776, ratio: 1.0002


In [18]:
# here is what we currently roughly measure
batch_size = 20 * 5 # 5 is grad_accum, so total batch size is 100
measured_time = 0.755 # in seconds per iteration
# sequences processed per second (assumes batch_size counts sequences — confirm)
measured_throughput = batch_size / measured_time
# f['total'] is the forward+backward estimate for one sequence, so this is FLOPs per second
flops_achieved = f['total'] * measured_throughput

# Peak throughput used as the denominator: 309.7 TFLOPS.
# NOTE(review): the variable name and the printed label say A100, whose commonly cited
# bfloat16 tensor-core peak is 312 TFLOPS, but the value here is 309.7e12 and a later cell
# labels the same figure "nvidia a 6000" — confirm which GPU and which peak figure apply.
a100_flops_promised = 309.7e12 

# the fraction of that peak that we are using:
print(f"fraction of A100 used: {flops_achieved / a100_flops_promised * 100:.2f}%")

fraction of A100 used: 5.90%


In [17]:
# Finally let's check out the 6ND approximation as total cost of training in FLOPs
n_gpus = 3
model_size = p['total'] # this is number of parameters, N
tokens_num = 300e9 # 300B tokens, this is dataset size in tokens, D
# NOTE(review): the variable is named a100_flops but the figure is labelled as an A6000 one — confirm
a100_flops = 309.7e12 # 309.7 tflops (nvidia a 6000)
# assumed model flops utilization, as a fraction of peak.
# NOTE(review): 0.8 is far above the 5.90% recorded in the output of the utilization cell,
# and the previous comment cited 37% — revisit this assumption before trusting the estimate.
assumed_mfu = 0.8
flops_throughput = a100_flops * n_gpus * assumed_mfu # aggregate usable FLOPs per second across n_gpus GPUs
flops_needed = 6 * model_size * tokens_num # 6ND
time_needed_s = flops_needed / flops_throughput # in seconds
print(f"time needed to train the model: {time_needed_s/3600/24:.2f} days")

time needed to train the model: 2.39 days
