Ref: https://www.youtube.com/watch?v=-2ebSQROew4

In [1]:
import torch, gc, inspect, transformers
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from accelerate.utils import set_seed
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set the transformers library's log verbosity to the WARNING level.
transformers.logging.set_verbosity_warning()

In [3]:
# Device string shared by the model, the random input tensors and the
# torch.cuda memory-statistics calls in the cells below.
device = 'cuda'

# Measuring Memory

In [4]:
def cleanup():
    """Release reclaimable memory and restart peak-memory tracking on `device`.

    The three steps run in a fixed order: Python garbage is collected first,
    then the CUDA caching allocator is emptied, and finally the peak-memory
    counters are reset so later readings only reflect work done after this call.
    """
    gc.collect()                                # collect unreachable Python objects
    torch.cuda.empty_cache()                    # release cached, unused CUDA blocks
    torch.cuda.reset_peak_memory_stats(device)  # peaks are tracked afresh from here


cleanup()

In [5]:
def print_memory_stats():
    """Print and return the peak GPU memory figures for `device`, in GB.

    Returns:
        A `(allocated, reserved)` tuple of floats, each the corresponding
        torch.cuda peak counter in bytes divided by 1e9.
    """
    bytes_per_gb = 1e9
    peak_allocated = torch.cuda.max_memory_allocated(device) / bytes_per_gb
    # "reserved" (formerly 'max_memory_cached') is roughly the allocated memory
    # plus whatever the allocator is holding pre-cached.
    peak_reserved = torch.cuda.max_memory_reserved(device) / bytes_per_gb

    print(f"Max memory allocated: {peak_allocated:.2f} GB")
    print(f"Max memory reserved: {peak_reserved:.2f} GB")

    return peak_allocated, peak_reserved


max_mem_allocated, max_mem_reserved = print_memory_stats()

Max memory allocated: 0.00 GB
Max memory reserved: 0.00 GB


In [6]:
# Disabled cell, kept for reference: a single forward/backward measurement for the
# pythia-1b checkpoint (batch size 1, context length 1024). Nothing below executes.
# cleanup()

# model_path = '/NS/llm-1/nobackup/vnanda/llm_base_models/pythia-1b'

# model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# print("Model loaded")

# batch_size = 1
# context_length = 1024

# data = torch.randint(0, 10000, (batch_size, context_length), device=device)

# output = model(data, labels=data)

# output.loss.backward()

# # Print memory stats
# print_memory_stats()

# # Cleanup
# del model, data, output
# cleanup()

In [7]:
# Checkpoint to profile; MODEL is the final path component and is used to name
# the per-model measurement cache file.
model_path = '/NS/llm-1/nobackup/vnanda/llm_base_models/pythia-12b'
MODEL = model_path.rsplit('/', 1)[-1]

# Results table, filled in as {batch_size: {context_length: (allocated, reserved)}}.
measurements = dict()

# Sweep grid: every batch size is combined with every context length.
BATCH_SIZES = [1, 2, 4, 6, 8]
CONTEXT_LENGTHS = [1, 2, 4, 6, 8] + [2 ** exponent for exponent in range(4, 12)]

In [8]:
import os

# The cache directory may not exist on a fresh checkout; create it so listing it
# (and writing the results file at the end of the notebook) cannot fail.
os.makedirs('Measurements', exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for reproducible output.
available_measurements = sorted(os.listdir('Measurements'))

print(available_measurements)

['memory_measurements_pythia-1b.json', 'memory_measurements_pythia-70m.json', 'memory_measurements_pythia-6.9b.json']


In [9]:
# Collect garbage, empty the CUDA cache and reset the peak-memory counters
# before the measurement sweep starts.
cleanup()

In [11]:
# Placeholder stored for configurations that raised (e.g. CUDA out of memory).
# The value is unchanged from the previous hard-coded (100, 100) so existing
# measurement files and any consumers of them keep working.
FAILED_MEASUREMENT = (100, 100)

measurement_file = 'Measurements/memory_measurements_' + MODEL + '.json'

if 'memory_measurements_' + MODEL + '.json' in available_measurements:

    with open(measurement_file, 'r') as f:
        loaded_measurements = json.load(f)

    # JSON object keys are always strings and tuples come back as lists, so
    # restore the in-memory layout {int batch_size: {int context_length: tuple}}.
    # Without this the integer membership tests below can never succeed.
    measurements = {
        int(bs): {int(cl): tuple(values) for cl, values in per_context.items()}
        for bs, per_context in loaded_measurements.items()
    }

    print("Loaded measurements")

    keys = list(measurements.keys())

    # Report every (batch size, context length) pair missing from the cached file.
    for bs in BATCH_SIZES:
        if bs not in keys:
            print(f"Batch size {bs} not present")
            continue
        for cl in CONTEXT_LENGTHS:
            if cl not in measurements[bs]:
                print(f"Context length {cl} not present for batch size {bs}")

else:
    for batch_size in BATCH_SIZES:
        # Keep results already present in the kernel (e.g. from an interrupted run).
        measurements.setdefault(batch_size, {})
        for context_length in CONTEXT_LENGTHS:

            if context_length in measurements[batch_size]:
                print(f"Batch size: {batch_size}, Context length: {context_length} already measured")
                print(f"Max Memory Allocated: {measurements[batch_size][context_length][0]} GB")
                print(f"Max Memory Reserved: {measurements[batch_size][context_length][1]} GB")
                continue

            print(f"Batch size: {batch_size}, Context length: {context_length}")

            # Bind the names up front so the `finally` clause can always delete
            # them, whichever step below raises.
            model = data = output = None
            try:
                cleanup()

                # A fresh model per configuration, so no gradients or cached state
                # from a previous run are counted in the peak statistics.
                model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

                print("Model loaded")

                # Random token ids; the same tensor doubles as the labels so the
                # forward pass yields a loss to backpropagate.
                data = torch.randint(0, 10000, (batch_size, context_length), device=device)

                output = model(data, labels=data)

                output.loss.backward()

                # Print memory stats
                max_mem_allocated, max_mem_reserved = print_memory_stats()

                measurements[batch_size][context_length] = (max_mem_allocated, max_mem_reserved)

            except Exception as e:
                print(f"Batch size: {batch_size}, Context length: {context_length} failed")
                print(e)
                measurements[batch_size][context_length] = FAILED_MEASUREMENT

            finally:
                # Tear down here rather than inside the handler: while `except` is
                # active the exception's traceback can still reference the frames
                # (and tensors) of the failed forward/backward pass, so freeing
                # memory at that point may not release them.
                del model, data, output
                cleanup()

Batch size: 1, Context length: 1 already measured
Max Memory Allocated: 100 GB
Max Memory Reserved: 100 GB
Batch size: 1, Context length: 2 already measured
Max Memory Allocated: 100 GB
Max Memory Reserved: 100 GB
Batch size: 1, Context length: 4


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.57s/it]


Batch size: 1, Context length: 4 failed
CUDA out of memory. Tried to allocate 990.00 MiB. GPU 
Batch size: 1, Context length: 6


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.23s/it]


Model loaded
Batch size: 1, Context length: 6 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 1, Context length: 8


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.42s/it]


Model loaded
Batch size: 1, Context length: 8 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 1, Context length: 16


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.25s/it]


Model loaded
Batch size: 1, Context length: 16 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 1, Context length: 32


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Batch size: 1, Context length: 32 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 1, Context length: 64


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.15s/it]


Model loaded
Batch size: 1, Context length: 64 failed
CUDA out of memory. Tried to allocate 300.00 MiB. GPU 
Batch size: 1, Context length: 128


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it]


Model loaded
Batch size: 1, Context length: 128 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 1, Context length: 256


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.36s/it]


Model loaded
Batch size: 1, Context length: 256 failed
CUDA out of memory. Tried to allocate 300.00 MiB. GPU 
Batch size: 1, Context length: 512


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.18s/it]


Model loaded
Batch size: 1, Context length: 512 failed
CUDA out of memory. Tried to allocate 300.00 MiB. GPU 
Batch size: 1, Context length: 1024


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.30s/it]


Model loaded
Batch size: 1, Context length: 1024 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 1, Context length: 2048


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.19s/it]


Model loaded
Batch size: 1, Context length: 2048 failed
CUDA out of memory. Tried to allocate 120.00 MiB. GPU 
Batch size: 2, Context length: 1


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.43s/it]


Model loaded
Batch size: 2, Context length: 1 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 2, Context length: 2


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.25s/it]


Model loaded
Batch size: 2, Context length: 2 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 2, Context length: 4


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.18s/it]


Model loaded
Batch size: 2, Context length: 4 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 2, Context length: 6


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.14s/it]


Model loaded
Batch size: 2, Context length: 6 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 2, Context length: 8


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.14s/it]


Model loaded
Batch size: 2, Context length: 8 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 2, Context length: 16


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.15s/it]


Model loaded
Batch size: 2, Context length: 16 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 2, Context length: 32


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.14s/it]


Model loaded
Batch size: 2, Context length: 32 failed
CUDA out of memory. Tried to allocate 300.00 MiB. GPU 
Batch size: 2, Context length: 64


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.26s/it]


Model loaded
Batch size: 2, Context length: 64 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 2, Context length: 128


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.15s/it]


Model loaded
Batch size: 2, Context length: 128 failed
CUDA out of memory. Tried to allocate 300.00 MiB. GPU 
Batch size: 2, Context length: 256


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.15s/it]


Model loaded
Batch size: 2, Context length: 256 failed
CUDA out of memory. Tried to allocate 300.00 MiB. GPU 
Batch size: 2, Context length: 512


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.25s/it]


Model loaded
Batch size: 2, Context length: 512 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 2, Context length: 1024


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.18s/it]


Model loaded
Batch size: 2, Context length: 1024 failed
CUDA out of memory. Tried to allocate 120.00 MiB. GPU 
Batch size: 2, Context length: 2048


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.15s/it]


Model loaded
Batch size: 2, Context length: 2048 failed
CUDA out of memory. Tried to allocate 240.00 MiB. GPU 
Batch size: 4, Context length: 1


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Batch size: 4, Context length: 1 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 4, Context length: 2


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.32s/it]


Model loaded
Batch size: 4, Context length: 2 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 4, Context length: 4


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]


Model loaded
Batch size: 4, Context length: 4 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 4, Context length: 6


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.23s/it]


Model loaded
Batch size: 4, Context length: 6 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 4, Context length: 8


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.17s/it]


Model loaded
Batch size: 4, Context length: 8 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 4, Context length: 16


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.19s/it]


Model loaded
Batch size: 4, Context length: 16 failed
CUDA out of memory. Tried to allocate 300.00 MiB. GPU 
Batch size: 4, Context length: 32


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.17s/it]


Model loaded
Batch size: 4, Context length: 32 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 4, Context length: 64


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.18s/it]


Model loaded
Batch size: 4, Context length: 64 failed
CUDA out of memory. Tried to allocate 300.00 MiB. GPU 
Batch size: 4, Context length: 128


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.28s/it]


Model loaded
Batch size: 4, Context length: 128 failed
CUDA out of memory. Tried to allocate 300.00 MiB. GPU 
Batch size: 4, Context length: 256


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.16s/it]


Model loaded
Batch size: 4, Context length: 256 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 4, Context length: 512


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.16s/it]


Model loaded
Batch size: 4, Context length: 512 failed
CUDA out of memory. Tried to allocate 120.00 MiB. GPU 
Batch size: 4, Context length: 1024


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.14s/it]


Model loaded
Batch size: 4, Context length: 1024 failed
CUDA out of memory. Tried to allocate 240.00 MiB. GPU 
Batch size: 4, Context length: 2048


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]


Model loaded
Batch size: 4, Context length: 2048 failed
CUDA out of memory. Tried to allocate 640.00 MiB. GPU 
Batch size: 6, Context length: 1


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]


Model loaded
Batch size: 6, Context length: 1 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 6, Context length: 2


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.44s/it]


Model loaded
Batch size: 6, Context length: 2 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 6, Context length: 4


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.23s/it]


Model loaded
Batch size: 6, Context length: 4 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 6, Context length: 6


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.15s/it]


Model loaded
Batch size: 6, Context length: 6 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 6, Context length: 8


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.16s/it]


Model loaded
Batch size: 6, Context length: 8 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 6, Context length: 16


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]


Model loaded
Batch size: 6, Context length: 16 failed
CUDA out of memory. Tried to allocate 100.00 MiB. GPU 
Batch size: 6, Context length: 32


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.17s/it]


Model loaded
Batch size: 6, Context length: 32 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 6, Context length: 64


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.34s/it]


Model loaded
Batch size: 6, Context length: 64 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 6, Context length: 128


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it]


Model loaded
Batch size: 6, Context length: 128 failed
CUDA out of memory. Tried to allocate 300.00 MiB. GPU 
Batch size: 6, Context length: 256


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.31s/it]


Model loaded
Batch size: 6, Context length: 256 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 6, Context length: 512


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]


Model loaded
Batch size: 6, Context length: 512 failed
CUDA out of memory. Tried to allocate 240.00 MiB. GPU 
Batch size: 6, Context length: 1024


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]


Model loaded
Batch size: 6, Context length: 1024 failed
CUDA out of memory. Tried to allocate 120.00 MiB. GPU 
Batch size: 6, Context length: 2048


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.33s/it]


Model loaded
Batch size: 6, Context length: 2048 failed
CUDA out of memory. Tried to allocate 960.00 MiB. GPU 
Batch size: 8, Context length: 1


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.17s/it]


Model loaded
Batch size: 8, Context length: 1 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 8, Context length: 2


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.16s/it]


Model loaded
Batch size: 8, Context length: 2 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 8, Context length: 4


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Batch size: 8, Context length: 4 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 8, Context length: 6


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Batch size: 8, Context length: 6 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 8, Context length: 8


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]


Model loaded
Batch size: 8, Context length: 8 failed
CUDA out of memory. Tried to allocate 300.00 MiB. GPU 
Batch size: 8, Context length: 16


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.17s/it]


Model loaded
Batch size: 8, Context length: 16 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 8, Context length: 32


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Batch size: 8, Context length: 32 failed
CUDA out of memory. Tried to allocate 300.00 MiB. GPU 
Batch size: 8, Context length: 64


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.15s/it]


Model loaded
Batch size: 8, Context length: 64 failed
CUDA out of memory. Tried to allocate 22.00 MiB. GPU 
Batch size: 8, Context length: 128


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.16s/it]


Model loaded
Batch size: 8, Context length: 128 failed
CUDA out of memory. Tried to allocate 400.00 MiB. GPU 
Batch size: 8, Context length: 256


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.16s/it]


Model loaded
Batch size: 8, Context length: 256 failed
CUDA out of memory. Tried to allocate 120.00 MiB. GPU 
Batch size: 8, Context length: 512


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Batch size: 8, Context length: 512 failed
CUDA out of memory. Tried to allocate 240.00 MiB. GPU 
Batch size: 8, Context length: 1024


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Model loaded
Batch size: 8, Context length: 1024 failed
CUDA out of memory. Tried to allocate 640.00 MiB. GPU 
Batch size: 8, Context length: 2048


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]


Model loaded
Batch size: 8, Context length: 2048 failed
CUDA out of memory. Tried to allocate 1.25 GiB. GPU 


In [12]:
# Display the results table: {batch_size: {context_length: (allocated, reserved)}};
# configurations that failed hold the (100, 100) placeholder.
measurements

{1: {1: (100, 100),
  2: (100, 100),
  4: (100, 100),
  6: (100, 100),
  8: (100, 100),
  16: (100, 100),
  32: (100, 100),
  64: (100, 100),
  128: (100, 100),
  256: (100, 100),
  512: (100, 100),
  1024: (100, 100),
  2048: (100, 100)},
 2: {1: (100, 100),
  2: (100, 100),
  4: (100, 100),
  6: (100, 100),
  8: (100, 100),
  16: (100, 100),
  32: (100, 100),
  64: (100, 100),
  128: (100, 100),
  256: (100, 100),
  512: (100, 100),
  1024: (100, 100),
  2048: (100, 100)},
 4: {1: (100, 100),
  2: (100, 100),
  4: (100, 100),
  6: (100, 100),
  8: (100, 100),
  16: (100, 100),
  32: (100, 100),
  64: (100, 100),
  128: (100, 100),
  256: (100, 100),
  512: (100, 100),
  1024: (100, 100),
  2048: (100, 100)},
 6: {1: (100, 100),
  2: (100, 100),
  4: (100, 100),
  6: (100, 100),
  8: (100, 100),
  16: (100, 100),
  32: (100, 100),
  64: (100, 100),
  128: (100, 100),
  256: (100, 100),
  512: (100, 100),
  1024: (100, 100),
  2048: (100, 100)},
 8: {1: (100, 100),
  2: (100, 100),
  4

In [13]:
# Write the results table to the per-model JSON cache file.
# Note: JSON stores the integer keys as strings and the tuples as lists.
import json

with open(f"Measurements/memory_measurements_{MODEL}.json", 'w') as f:
    f.write(json.dumps(measurements))