<a href="https://colab.research.google.com/github/amanzoni1/qLoRA-FSDP/blob/main/qLoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Memory-reduction ideas to explore:
# - Flash Attention 2
# - shard, quantize, checkpoint, or otherwise reduce the footprint
# - mixed precision, ZeRO/FSDP


# Notes from the PEFT docs on torch.compile:
# In PEFT, torch.compile works for some but not all features. It won't always
# work because PEFT is highly dynamic in certain places (loading and switching
# between multiple adapters, for instance), which can cause trouble for
# torch.compile. In other places, torch.compile may work but won't be as fast
# as expected because of graph breaks.

# TODO: check the quantization and FSDP sections of the PEFT docs



In [None]:
# 1. 📦 Setup

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import getpass
import os

# Prompt for the token without echoing it, so it never appears in the notebook source or output
hf_token = getpass.getpass('Enter your HF access token and press enter: ')

# Expose the token to this process through the HF_TOKEN environment variable
os.environ['HF_TOKEN'] = hf_token

print("HF_TOKEN environment variable has been set.")

Enter your HF access token and press enter: ··········
HF_TOKEN environment variable has been set.


In [2]:
# install core libs
!pip install transformers datasets tokenizers accelerate bitsandbytes peft

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_c

In [None]:
# GPU status & VRAM: print the GPU name, total memory and driver version as CSV
!nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv

name, memory.total [MiB], driver_version
Tesla T4, 15360 MiB, 550.54.15


In [None]:
import torch

# Report whether CUDA is usable and, if it is, the current device's name and memory size.
has_cuda = torch.cuda.is_available()
print("CUDA available? ", has_cuda)
if not has_cuda:
    print("Running on CPU—profiling memory won’t work here, but you can still inspect model structure.")
else:
    dev = torch.cuda.current_device()
    props = torch.cuda.get_device_properties(dev)
    print(f" Device: {props.name}")
    # total_memory is in bytes; shifting by 20 bits converts to whole MiB
    print(f" Total VRAM: {props.total_memory >> 20} MiB")

CUDA available?  True
 Device: Tesla T4
 Total VRAM: 15095 MiB


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Set the PyTorch CUDA allocator option through the process environment.
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
import torch

# Hugging Face Hub id of the checkpoint to load.
model_name = "meta-llama/Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Load the weights in fp16 and let device_map="auto" decide their placement.
# NOTE(review): ~8B parameters in fp16 is roughly 16 GB, more than the ~15 GiB Tesla T4
# this notebook was run on — presumably part of the model gets offloaded; confirm.
model     = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]



In [None]:
# Inspect the loaded model's configuration in three forms.
cfg = model.config
print(cfg)             # readable summary of the configuration
print(cfg.to_dict())   # the same settings as a plain Python dict
print(dir(cfg))        # every attribute and method name available on the config object

LlamaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "vocab_size": 128256
}

{'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 4096, 'intermediate_size': 14336, '

In [4]:
from typing import List, Dict
from transformers import AutoConfig

def estimate_training_vram(
    model_name: str,
    batch_size: int = 4,
    seq_len: int = 1024,
    *,
    dtype_weight: str = "int4",
    dtype_grad: str = "bf16",
    dtype_master: str = "fp32",
    dtype_activation: str = "fp16",
    use_lora: bool = False,
    lora_r: int = 8,
    target_modules: List[str] = ("q_proj", "k_proj"),
    pessimistic: bool = True,
) -> Dict[str, float]:
    """
    Analytic VRAM estimator for HF causal-LMs.

    Reads the architecture from ``AutoConfig.from_pretrained(model_name)``, counts
    parameters from the config dimensions, and sums weight, gradient, optimizer,
    master-copy, activation and scratch memory. Prints a summary and returns the
    same figures.

    Args:
        model_name: Model id or path passed to ``AutoConfig.from_pretrained``.
        batch_size: Sequences per step used for the activation term.
        seq_len: Tokens per sequence used for the activation term.
        dtype_weight: Storage dtype of the base weights
            (one of ``fp32``, ``fp16``, ``bf16``, ``int8``, ``int4``).
        dtype_grad: Dtype of the gradients of the trainable parameters.
        dtype_master: Dtype of the master copy and of both Adam moments.
        dtype_activation: Dtype of the stored activations.
        use_lora: If True only the LoRA parameters are treated as trainable;
            otherwise every parameter is.
        lora_r: LoRA rank.
        target_modules: Projection names that receive a LoRA adapter in every layer.
            Recognised names: q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj,
            down_proj.
        pessimistic: Count an int4 weight as 0.75 bytes instead of 0.5
            (presumably to leave room for quantization metadata).

    Returns:
        Dict of estimates in GiB (bytes / 1024**3) with the keys ``weights_gb``,
        ``grads_gb``, ``adam_gb``, ``master_gb``, ``activations_gb``,
        ``scratch_gb`` and ``total_gb``.

    Raises:
        ValueError: ``use_lora`` is True and ``target_modules`` contains an
            unrecognised name.
        KeyError: one of the dtype strings is not a supported name.
    """

    cfg = AutoConfig.from_pretrained(model_name)

    # ---------- core dims ----------
    V = cfg.vocab_size
    D = getattr(cfg, "hidden_size", None) or getattr(cfg, "n_embd")
    L = getattr(cfg, "num_hidden_layers", None) or getattr(cfg, "n_layer")
    # Fall back to 4*D both when the attribute is missing and when it is present
    # but unset (None), which would otherwise break the multiplication below.
    F = getattr(cfg, "intermediate_size", None) or getattr(cfg, "n_inner", None) or 4 * D

    # ---------- attention ----------
    h  = cfg.num_attention_heads
    kv = getattr(cfg, "num_key_value_heads", h)
    D_kv = int(D * kv / h)                      # width of the K/V projections
    att_params = D*D + 2*D*D_kv + D*D           # q + (k, v) + o

    # ---------- MLP ----------
    # These activations are counted as a 3-matrix MLP; anything else as 2-matrix.
    triple = str(cfg.hidden_act).lower() in {"silu", "swiglu", "gelu_new", "glu"}
    mlp_mult = 3 if triple else 2
    mlp_params = mlp_mult * D * F
    P_layer = att_params + mlp_params

    # ---------- embeddings ----------
    P_embed = V * D
    P_total = 2 * P_embed + L * P_layer     # untied head

    # ---------- LoRA ----------
    if use_lora:
        per_layer = 0
        for name in target_modules:
            # set (in_dim, out_dim) for each projection type
            if name in {"q_proj", "o_proj"}:
                in_dim, out_dim = D, D
            elif name in {"k_proj", "v_proj"}:
                in_dim, out_dim = D, D_kv
            elif name in {"gate_proj", "up_proj"}:
                in_dim, out_dim = D, F
            elif name == "down_proj":
                in_dim, out_dim = F, D
            else:
                raise ValueError(f"Unknown target module: {name}")

            # LoRA adds A (r×in) + B (out×r) = r·(in + out)
            per_layer += lora_r * (in_dim + out_dim)

        P_lora = per_layer * L            # total across all layers
    else:
        P_lora = 0

    P_train = P_lora if use_lora else P_total

    # ---------- bytes ----------
    base = {"fp32": 4, "fp16": 2, "bf16": 2, "int8": 1, "int4": 0.75 if pessimistic else 0.5}
    b_w, b_g, b_m, b_a = base[dtype_weight], base[dtype_grad], base[dtype_master], base[dtype_activation]

    weight_B = P_total * b_w * 1.01         # base weights plus 1 % overhead
    grad_B   = P_train * b_g
    adam_B   = P_train * 2 * b_m            # two Adam moments per trainable parameter
    master_B = P_train * b_m
    static_B = weight_B + grad_B + adam_B + master_B

    # Two hidden-state-sized tensors (batch × seq × D) per layer.
    activ_B  = 2 * L * batch_size * seq_len * D * b_a
    scratch_B = 0.15 * activ_B              # 15 % of activations as scratch space
    total_B  = 1.10 * (static_B + activ_B + scratch_B)   # 10 % safety margin

    GB = 1024 ** 3
    print(f"\nModel  : {model_name}")
    print(f"Params : {P_total/1e9:.2f} B")
    if use_lora:
        print(f"LoRA   : {P_lora/1e6:.1f} M ({100*P_lora/P_total:.2f} %)  r={lora_r}")
    print(f"B={batch_size}, S={seq_len}")
    print(f"  Base weights     : {weight_B/GB:5.2f} GB")
    print(f"  Grad+Adam+master : {(grad_B+adam_B+master_B)/GB:5.2f} GB")
    print(f"  Activations      : {activ_B/GB:5.2f} GB")
    print(f"  Scratch          : {scratch_B/GB:5.2f} GB")
    print(f"≈ Estimated VRAM   : {total_B/GB:5.2f} GB\n")

    # Return the figures that were printed so callers can use them programmatically.
    return {
        "weights_gb": weight_B / GB,
        "grads_gb": grad_B / GB,
        "adam_gb": adam_B / GB,
        "master_gb": master_B / GB,
        "activations_gb": activ_B / GB,
        "scratch_gb": scratch_B / GB,
        "total_gb": total_B / GB,
    }

In [15]:
# Estimate VRAM for rank-16 LoRA on q_proj/k_proj over int4 base weights,
# at batch size 4 and 2048 tokens per sequence.
estimate_training_vram(
    "meta-llama/Llama-3.1-8B",
    batch_size=4,
    seq_len=2048,
    dtype_weight="int4",
    dtype_grad="bf16",
    dtype_master="fp32",
    dtype_activation="fp16",
    use_lora=True,
    lora_r=16,
    target_modules=["q_proj", "k_proj"],
)


Model  : meta-llama/Llama-3.1-8B
Params : 8.03 B
LoRA   : 6.8 M (0.08 %)  r=16
B=4, S=2048
  Base weights     :  5.66 GB
  Grad+Adam+master :  0.09 GB
  Activations      :  4.00 GB
  Scratch          :  0.60 GB
≈ Estimated VRAM   : 11.39 GB



In [13]:
from peft import LoraConfig, TaskType

# LoRA settings: rank-16 adapters on the q_proj and k_proj modules only.
lora_kwargs = dict(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,
    lora_alpha=128,
    target_modules=["q_proj", "k_proj"],
    lora_dropout=0,
    bias="none",
)
peft_config = LoraConfig(**lora_kwargs)

In [14]:
from peft import get_peft_model

# Wrap the model with the adapters described by peft_config and print how many
# parameters are trainable.
# NOTE: `model` is rebound here, so re-running this cell passes the already-wrapped
# model to get_peft_model again.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()



trainable params: 6,815,744 || all params: 8,037,076,992 || trainable%: 0.0848


In [1]:
# ----- remove the adapter permanently -----
# Requires the earlier cells to have run: `model` (PEFT-wrapped), `torch`,
# `LoraConfig`, `TaskType` and `get_peft_model` must already be defined.
import gc
# Alternatives kept for reference:
# model = model.merge_and_unload()
# model.disable_adapter()

# unload() hands back the underlying base model; rebind `model` to it so that the
# get_peft_model call below wraps the base model rather than the old PEFT wrapper.
model = model.unload()
gc.collect(); torch.cuda.empty_cache()

# Second configuration: same rank/alpha, but adapters on all attention and MLP projections.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,
    lora_alpha=128,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout = 0,
    bias = "none",
  )

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

NameError: name 'model' is not defined

In [None]:
# Output recorded from an earlier run (presumably of the cell above):
# trainable params: 167,772,160 || all params: 8,198,033,408 || trainable%: 2.0465

In [None]:
!pip install -q torchinfo torchview

In [None]:
from torchinfo import summary
# Layer-by-layer summary for a (1, 2048) batch of token ids.
# With only input_size given, torchinfo builds a float dummy input, which a model
# that looks up token embeddings cannot consume; dtypes makes the input torch.long.
summary(
    model,
    depth=2,
    input_size=(1, 2048),
    dtypes=[torch.long],
    col_names=("output_size","num_params")
)



RuntimeError: Failed to run torchinfo. See above stack traces for more details. Executed layers up to: []

In [None]:
from torchview import draw_graph
# Draw the computation graph for a (1, 2048) batch of token ids.
# dtypes makes torchview create an integer (torch.long) dummy input instead of a
# float one, which a model that looks up token embeddings cannot consume.
graph = draw_graph(
    model,
    input_size=(1, 2048),      # batch, seq_len
    dtypes=[torch.long],
    graph_name="Llama-3.1-8B",
    save_graph=False
)
graph.visual_graph



RuntimeError: Failed to run torchgraph see error message

In [None]:
# Tokenize a short repeated prompt and move every returned tensor to the model's device.
prompt = "Hello world! " * 64
encoded = tokenizer(prompt, return_tensors="pt")
dummy = {name: tensor.to(model.device) for name, tensor in encoded.items()}

In [None]:
# 3. Profile one training step (forward + backward + optimizer update) with torch.profiler
# Requires `model`, `dummy` and `torch` from the earlier cells.
from torch.profiler import profile, record_function, ProfilerActivity

# Reset CUDA counters (optional)
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True,
) as prof:
    with record_function("training_step"):
        optimizer.zero_grad()
        outputs = model(**dummy)           # forward
        # `dummy` carries no labels, so the output's loss can be None even though the
        # attribute exists; fall back to a scalar built from the logits in that case.
        loss = getattr(outputs, "loss", None)
        if loss is None:
            loss = outputs.logits.sum()
        loss.backward()                    # backward
        optimizer.step()

# Show top 10 ops by CUDA memory usage
print(prof.key_averages()
      .table(sort_by="cuda_memory_usage", row_limit=10))


# Probably an improved variant (profiles a single forward pass only), kept for reference:
# from torch.profiler import profile, record_function, ProfilerActivity

# with profile(
#     activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
#     record_shapes=True,
#     profile_memory=True,
# ) as prof:
#     with record_function("training_step"):
#         _ = model_fp16(**dummy)

# print(prof.key_averages().table(
#       sort_by="cuda_memory_usage",
#       row_limit=10,
#       col_names=["name","input_shapes","self_cuda_memory_usage"]))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference         9.00%      71.942ms       100.00%     798.963ms     798.963ms       0.000us         0.00%     103.958ms     103.958ms       1.88 Kb       1.12 Kb     908.48 Mb    -511.31 M