Ref: https://www.youtube.com/watch?v=-2ebSQROew4

In [40]:
import torch, gc, inspect, transformers
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from accelerate.utils import set_seed
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import json

In [41]:
# Limit the transformers library's log output to warnings and errors.
transformers.logging.set_verbosity_warning()

In [42]:
# Expose only the GPU with index 1 to this process; inside the process it is
# then addressed simply as 'cuda'.
import os
import warnings

# CUDA_VISIBLE_DEVICES is read when CUDA is initialised. If that has already
# happened in this kernel (e.g. the cell is re-run after GPU work), changing the
# variable has no effect, so tell the user instead of failing silently.
if torch.cuda.is_initialized() and os.environ.get("CUDA_VISIBLE_DEVICES") != "1":
    warnings.warn(
        "CUDA is already initialised; setting CUDA_VISIBLE_DEVICES now has no "
        "effect. Restart the kernel and run this cell before any CUDA call."
    )
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = 'cuda'

# Measuring Memory

In [43]:
def cleanup():
    """Release unused GPU memory and restart peak-memory tracking.

    Runs Python garbage collection, empties the CUDA caching allocator and
    resets the peak memory statistics of the module-level ``device``.
    """
    # Collect garbage first so that tensors which are no longer reachable are
    # released before the allocator cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats(device)


# Start the notebook from a clean slate.
cleanup()

In [44]:
def print_memory_stats():
    """Print and return the peak GPU memory usage of ``device``.

    Returns:
        tuple: ``(max allocated, max reserved)``, both in GB (bytes / 1e9),
        as reported by ``torch.cuda.max_memory_allocated`` and
        ``torch.cuda.max_memory_reserved``.
    """
    bytes_per_gb = 1e9
    peak_allocated = torch.cuda.max_memory_allocated(device) / bytes_per_gb
    peak_reserved = torch.cuda.max_memory_reserved(device) / bytes_per_gb

    print(f"Max memory allocated: {peak_allocated:.2f} GB")
    # "reserved" (formerly 'max_memory_cached') is roughly the allocated
    # memory plus what the caching allocator keeps in reserve.
    print(f"Max memory reserved: {peak_reserved:.2f} GB")

    return peak_allocated, peak_reserved


# Baseline reading before any model has been loaded.
max_mem_allocated, max_mem_reserved = print_memory_stats()

Max memory allocated: 0.03 GB
Max memory reserved: 0.03 GB


In [45]:
# cleanup()

# model_path = '/NS/llm-1/nobackup/vnanda/llm_base_models/pythia-1b'

# model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# print("Model loaded")

# batch_size = 1
# context_length = 1024

# data = torch.randint(0, 10000, (batch_size, context_length), device=device)

# output = model(data, labels=data)

# output.loss.backward()

# # Print memory stats
# print_memory_stats()

# # Cleanup
# del model, data, output
# cleanup()

In [46]:
# Sweep configuration: which checkpoint to load and which input shapes to try.
model_path = '/NS/llm-1/nobackup/vnanda/llm_base_models/pythia-12b'
# Short model name (last path component); used to build the result file names.
MODEL = model_path.rsplit('/', 1)[-1]
# Results, filled by the sweep as
# {batch_size: {context_length: (max allocated GB, max reserved GB)}}.
measurements = {}
BATCH_SIZES = [1, 2, 4, 6, 8]
# 1, 2, 4, 6, 8 followed by the powers of two from 16 up to 2048.
CONTEXT_LENGTHS = [1, 2, 4, 6, 8] + [2 ** exponent for exponent in range(4, 12)]

In [47]:
import os

# Create the results directory if it does not exist yet, so that a fresh
# checkout can list it here and the dump cell at the end can write into it.
os.makedirs('Measurements', exist_ok=True)

# File names of the measurement results that have already been saved.
available_measurements = os.listdir('Measurements')

print(available_measurements)

['memory_measurements_forward_only_pythia-6.9b.json', 'memory_measurements_pythia-1b.json', 'memory_measurements_pythia-70m.json', 'memory_measurements_forward_only_pythia-70m.json', 'memory_measurements_forward_only_pythia-1b.json', 'memory_measurements_pythia-6.9b.json']


In [48]:
# Free cached GPU memory and reset the peak statistics before the sweep starts.
cleanup()

In [49]:
# Forward-pass memory sweep for MODEL.
#
# If a result file for this model already exists it is loaded and checked for
# missing (batch size, context length) combinations; nothing is measured in
# that case. Otherwise every combination in BATCH_SIZES x CONTEXT_LENGTHS is
# measured and stored in `measurements` as
# {batch_size: {context_length: (max allocated GB, max reserved GB)}}.

# Placeholder stored when a configuration raises (in practice: CUDA OOM).
FAILED_MEASUREMENT = (100, 100)
forward_only_file = 'memory_measurements_forward_only_' + MODEL + '.json'

if forward_only_file in available_measurements:

    with open('Measurements/' + forward_only_file, 'r') as f:
        cached = json.load(f)

    # JSON object keys are always strings and tuples are stored as lists.
    # Convert back to the in-memory layout (int keys, tuple values) so the
    # membership checks below compare like with like.
    measurements = {
        int(bs): {int(cl): tuple(stats) for cl, stats in per_context.items()}
        for bs, per_context in cached.items()
    }

    print("Loaded measurements")

    keys = list(measurements.keys())

    # Report every configuration of the current sweep that the file lacks.
    for bs in BATCH_SIZES:
        if bs not in keys:
            print(f"Batch size {bs} not present")
            continue
        for cl in CONTEXT_LENGTHS:
            if cl not in measurements[bs]:
                print(f"Context length {cl} not present for batch size {bs}")

else:
    for batch_size in BATCH_SIZES:
        measurements.setdefault(batch_size, {})
        for context_length in CONTEXT_LENGTHS:

            # Skip configurations left over from an earlier (partial) run.
            if context_length in measurements[batch_size]:
                print(f"Batch size: {batch_size}, Context length: {context_length} already measured")
                print(f"Max Memory Allocated: {measurements[batch_size][context_length][0]} GB")
                print(f"Max Memory Reserved: {measurements[batch_size][context_length][1]} GB")
                continue

            print(f"Batch size: {batch_size}, Context length: {context_length}")

            # Bind the names up front so the `finally` clause can always
            # delete them, whichever step fails.
            model = data = output = None
            try:
                cleanup()

                # The model is reloaded for every configuration so that each
                # measurement starts from the same state.
                model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

                print("Model loaded")

                # Random token ids; only the tensor shape matters here.
                data = torch.randint(0, 10000, (batch_size, context_length), device=device)

                # NOTE: autograd is not disabled for this call, so the peak
                # includes whatever the forward pass keeps for a backward pass.
                output = model(data, labels=data)

                # output.loss.backward()

                # Print memory stats
                max_mem_allocated, max_mem_reserved = print_memory_stats()

                measurements[batch_size][context_length] = (max_mem_allocated, max_mem_reserved)

            except Exception as e:
                print(f"Batch size: {batch_size}, Context length: {context_length} failed")
                print(e)
                measurements[batch_size][context_length] = FAILED_MEASUREMENT

            finally:
                # Release the GPU objects only after the `except` clause has
                # finished: while the exception is being handled its traceback
                # keeps the failing frames (and their tensors) alive, so
                # freeing memory inside the handler would not release them.
                del model, data, output
                cleanup()

Batch size: 1, Context length: 1


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]


Model loaded
Max memory allocated: 47.60 GB
Max memory reserved: 47.62 GB
Batch size: 1, Context length: 2


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.22s/it]


Model loaded
Max memory allocated: 47.61 GB
Max memory reserved: 47.63 GB
Batch size: 1, Context length: 4


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Max memory allocated: 47.64 GB
Max memory reserved: 47.66 GB
Batch size: 1, Context length: 6


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.35s/it]


Model loaded
Max memory allocated: 47.67 GB
Max memory reserved: 47.68 GB
Batch size: 1, Context length: 8


Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.67s/it]


Model loaded
Max memory allocated: 47.69 GB
Max memory reserved: 47.71 GB
Batch size: 1, Context length: 16


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.22s/it]


Model loaded
Max memory allocated: 47.80 GB
Max memory reserved: 47.83 GB
Batch size: 1, Context length: 32


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.54s/it]


Model loaded
Max memory allocated: 48.00 GB
Max memory reserved: 48.03 GB
Batch size: 1, Context length: 64


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Max memory allocated: 48.42 GB
Max memory reserved: 48.43 GB
Batch size: 1, Context length: 128


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.44s/it]


Model loaded
Max memory allocated: 49.25 GB
Max memory reserved: 49.28 GB
Batch size: 1, Context length: 256


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]


Model loaded
Max memory allocated: 50.95 GB
Max memory reserved: 50.97 GB
Batch size: 1, Context length: 512


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it]


Model loaded
Max memory allocated: 54.24 GB
Max memory reserved: 54.28 GB
Batch size: 1, Context length: 1024


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]


Model loaded
Max memory allocated: 60.90 GB
Max memory reserved: 60.96 GB
Batch size: 1, Context length: 2048


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]


Model loaded
Max memory allocated: 74.20 GB
Max memory reserved: 74.34 GB
Batch size: 2, Context length: 1


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.22s/it]


Model loaded
Max memory allocated: 47.61 GB
Max memory reserved: 47.63 GB
Batch size: 2, Context length: 2


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it]


Model loaded
Max memory allocated: 47.64 GB
Max memory reserved: 47.66 GB
Batch size: 2, Context length: 4


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]


Model loaded
Max memory allocated: 47.69 GB
Max memory reserved: 47.71 GB
Batch size: 2, Context length: 6


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.22s/it]


Model loaded
Max memory allocated: 47.75 GB
Max memory reserved: 47.77 GB
Batch size: 2, Context length: 8


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]


Model loaded
Max memory allocated: 47.80 GB
Max memory reserved: 47.83 GB
Batch size: 2, Context length: 16


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it]


Model loaded
Max memory allocated: 48.01 GB
Max memory reserved: 48.05 GB
Batch size: 2, Context length: 32


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Max memory allocated: 48.43 GB
Max memory reserved: 48.44 GB
Batch size: 2, Context length: 64


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.19s/it]


Model loaded
Max memory allocated: 49.28 GB
Max memory reserved: 49.31 GB
Batch size: 2, Context length: 128


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]


Model loaded
Max memory allocated: 51.00 GB
Max memory reserved: 51.02 GB
Batch size: 2, Context length: 256


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.28s/it]


Model loaded
Max memory allocated: 54.34 GB
Max memory reserved: 54.39 GB
Batch size: 2, Context length: 512


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.33s/it]


Model loaded
Max memory allocated: 61.10 GB
Max memory reserved: 61.16 GB
Batch size: 2, Context length: 1024


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]


Model loaded
Max memory allocated: 74.61 GB
Max memory reserved: 74.74 GB
Batch size: 2, Context length: 2048


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.34s/it]


Model loaded
Batch size: 2, Context length: 2048 failed
CUDA out of memory. Tried to allocate 20.00 MiB. GPU 
Batch size: 4, Context length: 1


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.22s/it]


Model loaded
Max memory allocated: 47.64 GB
Max memory reserved: 47.66 GB
Batch size: 4, Context length: 2


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.22s/it]


Model loaded
Max memory allocated: 47.69 GB
Max memory reserved: 47.71 GB
Batch size: 4, Context length: 4


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.34s/it]


Model loaded
Max memory allocated: 47.80 GB
Max memory reserved: 47.83 GB
Batch size: 4, Context length: 6


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.52s/it]


Model loaded
Max memory allocated: 47.91 GB
Max memory reserved: 47.93 GB
Batch size: 4, Context length: 8


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Max memory allocated: 48.01 GB
Max memory reserved: 48.03 GB
Batch size: 4, Context length: 16


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.19s/it]


Model loaded
Max memory allocated: 48.43 GB
Max memory reserved: 48.44 GB
Batch size: 4, Context length: 32


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]


Model loaded
Max memory allocated: 49.28 GB
Max memory reserved: 49.30 GB
Batch size: 4, Context length: 64


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Max memory allocated: 51.00 GB
Max memory reserved: 51.02 GB
Batch size: 4, Context length: 128


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.22s/it]


Model loaded
Max memory allocated: 54.34 GB
Max memory reserved: 54.39 GB
Batch size: 4, Context length: 256


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]


Model loaded
Max memory allocated: 61.10 GB
Max memory reserved: 61.16 GB
Batch size: 4, Context length: 512


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]


Model loaded
Max memory allocated: 74.60 GB
Max memory reserved: 74.74 GB
Batch size: 4, Context length: 1024


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.25s/it]


Model loaded
Batch size: 4, Context length: 1024 failed
CUDA out of memory. Tried to allocate 20.00 MiB. GPU 
Batch size: 4, Context length: 2048


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.25s/it]


Model loaded
Batch size: 4, Context length: 2048 failed
CUDA out of memory. Tried to allocate 640.00 MiB. GPU 
Batch size: 6, Context length: 1


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]


Model loaded
Max memory allocated: 47.67 GB
Max memory reserved: 47.68 GB
Batch size: 6, Context length: 2


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.25s/it]


Model loaded
Max memory allocated: 47.74 GB
Max memory reserved: 47.77 GB
Batch size: 6, Context length: 4


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.23s/it]


Model loaded
Max memory allocated: 47.91 GB
Max memory reserved: 47.93 GB
Batch size: 6, Context length: 6


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.22s/it]


Model loaded
Max memory allocated: 48.06 GB
Max memory reserved: 48.15 GB
Batch size: 6, Context length: 8


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.57s/it]


Model loaded
Max memory allocated: 48.22 GB
Max memory reserved: 48.28 GB
Batch size: 6, Context length: 16


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.40s/it]


Model loaded
Max memory allocated: 48.85 GB
Max memory reserved: 48.95 GB
Batch size: 6, Context length: 32


Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.91s/it]


Model loaded
Max memory allocated: 50.21 GB
Max memory reserved: 50.29 GB
Batch size: 6, Context length: 64


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.25s/it]


Model loaded
Max memory allocated: 52.65 GB
Max memory reserved: 53.32 GB
Batch size: 6, Context length: 128


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it]


Model loaded
Max memory allocated: 57.98 GB
Max memory reserved: 58.02 GB
Batch size: 6, Context length: 256


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.26s/it]


Model loaded
Max memory allocated: 67.85 GB
Max memory reserved: 67.95 GB
Batch size: 6, Context length: 512


Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.83s/it]


Model loaded
Batch size: 6, Context length: 512 failed
CUDA out of memory. Tried to allocate 60.00 MiB. GPU 
Batch size: 6, Context length: 1024


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Batch size: 6, Context length: 1024 failed
CUDA out of memory. Tried to allocate 120.00 MiB. GPU 
Batch size: 6, Context length: 2048


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.22s/it]


Model loaded
Batch size: 6, Context length: 2048 failed
CUDA out of memory. Tried to allocate 960.00 MiB. GPU 
Batch size: 8, Context length: 1


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Max memory allocated: 47.69 GB
Max memory reserved: 47.71 GB
Batch size: 8, Context length: 2


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.23s/it]


Model loaded
Max memory allocated: 47.80 GB
Max memory reserved: 47.83 GB
Batch size: 8, Context length: 4


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.28s/it]


Model loaded
Max memory allocated: 48.01 GB
Max memory reserved: 48.03 GB
Batch size: 8, Context length: 6


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.34s/it]


Model loaded
Max memory allocated: 48.22 GB
Max memory reserved: 48.28 GB
Batch size: 8, Context length: 8


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.34s/it]


Model loaded
Max memory allocated: 48.43 GB
Max memory reserved: 48.45 GB
Batch size: 8, Context length: 16


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]


Model loaded
Max memory allocated: 49.28 GB
Max memory reserved: 49.30 GB
Batch size: 8, Context length: 32


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.32s/it]


Model loaded
Max memory allocated: 51.00 GB
Max memory reserved: 51.02 GB
Batch size: 8, Context length: 64


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.31s/it]


Model loaded
Max memory allocated: 54.34 GB
Max memory reserved: 54.38 GB
Batch size: 8, Context length: 128


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it]


Model loaded
Max memory allocated: 61.09 GB
Max memory reserved: 61.16 GB
Batch size: 8, Context length: 256


Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.75s/it]


Model loaded
Max memory allocated: 74.60 GB
Max memory reserved: 74.74 GB
Batch size: 8, Context length: 512


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.19s/it]


Model loaded
Batch size: 8, Context length: 512 failed
CUDA out of memory. Tried to allocate 20.00 MiB. GPU 
Batch size: 8, Context length: 1024


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.15s/it]


Model loaded
Batch size: 8, Context length: 1024 failed
CUDA out of memory. Tried to allocate 640.00 MiB. GPU 
Batch size: 8, Context length: 2048


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Batch size: 8, Context length: 2048 failed
CUDA out of memory. Tried to allocate 1.25 GiB. GPU 


In [50]:
# Show the results collected by the sweep cell:
# {batch_size: {context_length: (max allocated GB, max reserved GB)}}.
# An entry of (100, 100) marks a configuration whose measurement raised an exception.
measurements

{1: {1: (47.60166912, 47.620030464),
  2: (47.61424384, 47.632613376),
  4: (47.639765504, 47.6577792),
  6: (47.665762816, 47.682945024),
  8: (47.691723264, 47.714402304),
  16: (47.79668736, 47.82555136),
  32: (48.00386304, 48.033169408),
  64: (48.4203648, 48.43372544),
  128: (49.252058112, 49.280974848),
  256: (50.9524224, 50.97127936),
  512: (54.24244224, 54.282682368),
  1024: (60.89544192, 60.955820032),
  2048: (74.203132928, 74.337746944)},
 2: {1: (47.61442816, 47.632613376),
  2: (47.640153088, 47.659876352),
  4: (47.692884992, 47.714402304),
  6: (47.74565376, 47.766831104),
  8: (47.79943424, 47.82555136),
  16: (48.009780224, 48.054140928),
  32: (48.432462848, 48.4442112),
  64: (49.276813312, 49.306140672),
  128: (51.0036736, 51.021611008),
  256: (54.343287808, 54.385442816),
  512: (61.098349568, 61.159243776),
  1024: (74.608948224, 74.74249728),
  2048: (100, 100)},
 4: {1: (47.63994624, 47.6577792),
  2: (47.692442624, 47.714402304),
  4: (47.798955008, 47.8

In [51]:
# Persist the forward-only results for this model as JSON.
# Note: json.dump writes the integer dictionary keys as strings.
import json

results_path = f"Measurements/memory_measurements_forward_only_{MODEL}.json"
with open(results_path, 'w') as f:
    json.dump(measurements, f)