<a href="https://www.kaggle.com/code/liuserr/notebook2313628fdd?scriptVersionId=223484009" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Task B: FSDP2 with QLoRA
This notebook demonstrates a functional application of QLoRA using FSDP2. The acceleration is moderate, due to large overhead. Here are the three loss curves generated. 1 is using FSDP1 on a single GPU. 2 is using FSDP2 without gradient accumulation, making it more susceptible to memory overload. 3 is using FSDP2 with gradient accumulation. As you can see, the loss curves are the same for all configurations.

*FSDP1*

![FSDP1](loss_plot.png "FSDP1") 

*FSDP2 No Gradient Accum*

![alt text](loss_plot_v2.png "FSDP2 No Gradient Accum") 

*FSDP2 Gradient Accum*

![alt text](loss_plot_gradient_accum.png "FSDP2 Gradient Accum")

In [None]:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

In [None]:
!pip install -U torch accelerate transformers bitsandbytes peft

In [None]:
!pip install --upgrade torchvision torch

In [None]:
!pip install --upgrade --force-reinstall "numpy<2.0"

In [None]:
!pip install importlib-metadata

# Single GPU FSDP1

![image](loss_plot.png)

In [7]:
%%writefile single_gpu.py
import os
import random
import numpy as np
import torch

seed = 3407
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# --- Environment Setup ---
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import torch
import torch.distributed as dist
from torch.distributed.fsdp import (
    FullyShardedDataParallel as FSDP,
    CPUOffload,
    ShardingStrategy,
    MixedPrecision,
)
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from accelerate import Accelerator
from functools import partial

# --- 1) Distributed Setup ---
if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
else:
    local_rank = 0

accelerator = Accelerator(mixed_precision="bf16")
device = torch.device("cuda", local_rank)

# --- 2) Model & 4-bit Quantization ---
model_name = "unsloth/meta-Llama-3.1-8B-Instruct-bnb-4bit"
dtype = torch.bfloat16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=dtype,
    bnb_4bit_quant_storage=dtype,  # try storing quantized weights as BF16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=dtype,
    trust_remote_code=True,
)
model = prepare_model_for_kbit_training(model)

# --- 3) Apply LoRA ---
lora_config = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

for name, param in model.named_parameters():
    if ".lora_" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# --- Integrate torch.compile (PyTorch 2.0 feature) ---
try:
    model = torch.compile(model)
    accelerator.print("torch.compile: Model compiled successfully.")
except Exception as e:
    accelerator.print(f"torch.compile: Compilation failed with error: {e}. Continuing without compilation.")

torch.cuda.empty_cache()

# --- Debug: Print buffer names and dtypes ---
if local_rank == 0:
    buffer_info = [(name, buf.dtype) for name, buf in model.named_buffers()]
    print("Buffers in model (name, dtype):")
    for name, dt in buffer_info:
        print(name, dt)

# --- 5) FSDP Wrapping with FSDP1 ---
# Ignore the frozen base module from FSDP sharding.
base_model_submodule = model.model  
ignored_modules = [base_model_submodule]

auto_wrap_policy = partial(
    size_based_auto_wrap_policy,
    min_num_params=5e7,
    recurse=True,
    nonwrapped_numel=0,
)

model = FSDP(
    model,
    auto_wrap_policy=auto_wrap_policy,
    sharding_strategy=ShardingStrategy.FULL_SHARD,  # Ensure full sharding (FSDP1)
    use_orig_params=True,
    mixed_precision=MixedPrecision(param_dtype=dtype),
    ignored_modules=ignored_modules,
    device_id=device,
)

# --- 6) Tokenizer & Dataset ---
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "</s>"

dataset = load_dataset(
    "json",
    data_files={"train": "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"},
    split="train[:10%]"
)

def tokenize_example(ex):
    enc = tokenizer(
        ex["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    enc["labels"] = enc["input_ids"].copy()
    return enc

dataset = dataset.map(tokenize_example, batched=True, remove_columns=["text"])
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
train_dataloader = accelerator.prepare(train_dataloader)

# --- 7) Optimizer ---
trainable_params = [p for p in model.parameters() if p.requires_grad]
try:
    from bitsandbytes.optim import Adam8bit
    optimizer = Adam8bit(trainable_params, lr=2e-4)
    accelerator.print("Using bitsandbytes Adam8bit optimizer.")
except ImportError:
    optimizer = torch.optim.AdamW(trainable_params, lr=2e-4)
    accelerator.print("Using torch.optim.AdamW optimizer.")
optimizer = accelerator.prepare(optimizer)

# --- 8) Training Loop with Gradient Accumulation ---
loss_history = []
model.train()
gradient_accumulation_steps = 16  # Adjust this value as needed

for step, batch in enumerate(train_dataloader, start=1):
    # The accelerator.accumulate context will accumulate gradients over the specified steps.
    with accelerator.accumulate(model):
        outputs = model(**batch)
        # Scale loss to account for gradient accumulation
        loss = outputs.loss / gradient_accumulation_steps
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        # Clear cache after the optimizer step to lower VRAM usage
        if accelerator.sync_gradients:
            torch.cuda.empty_cache()
    
    if accelerator.is_main_process:
        loss_history.append((step, outputs.loss.item()))
        accelerator.print(f"Step {step} - Loss: {outputs.loss.item():.4f}")
    
    # Terminate after 60 steps (micro-batches)
    if step >= 60:
        break
accelerator.wait_for_everyone()

if accelerator.is_main_process:
    try:
        import matplotlib.pyplot as plt
        steps, losses = zip(*loss_history) if loss_history else ([], [])
        if steps:
            plt.figure(figsize=(8, 4))
            plt.plot(steps, losses, marker="o")
            plt.xlabel("Training Step")
            plt.ylabel("Loss")
            plt.title("Loss over the First 60 Training Steps")
            plt.grid(True)
            plt.savefig("loss_plot.png")
            plt.show()
        else:
            print("No loss data recorded.")
    except ImportError:
        accelerator.print("matplotlib is not installed. Skipping loss plot.")

# --- 9) Save Model ---
if accelerator.is_main_process:
    final_model = accelerator.unwrap_model(model)
    final_model.save_pretrained("llama-8b-finetuned", safe_serialization=True)
    tokenizer.save_pretrained("llama-8b-finetuned")

if dist.is_initialized():
    dist.destroy_process_group()


Overwriting single_gpu.py


In [9]:
!torchrun --nproc_per_node=2 --nnodes=1 single_gpu.py

W0219 22:05:47.889000 134 torch/distributed/run.py:792] 
W0219 22:05:47.889000 134 torch/distributed/run.py:792] *****************************************
W0219 22:05:47.889000 134 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0219 22:05:47.889000 134 torch/distributed/run.py:792] *****************************************
2025-02-19 22:05:56.279975: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-19 22:05:56.279957: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-19 22:05:56.469977: E external/local_xla/xla/stream_e

# FSDP2 without Gradient Accumulation
![image](loss_plot_v2.png)

In [10]:
%%writefile finetune_llama31_8b_fsdp2_qlora_revised_v2.py
import os

import random
import numpy as np
import torch

seed = 3407
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Optionally, enforce determinism (may slow down training):
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False


# --- Environment Setup ---
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import torch
import torch.distributed as dist
from torch.distributed.fsdp import (
    FullyShardedDataParallel as FSDP,
    CPUOffload,
    ShardingStrategy,
    MixedPrecision,
)
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from accelerate import Accelerator
from functools import partial

# --- 1) Distributed Setup ---
if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
else:
    local_rank = 0

accelerator = Accelerator(mixed_precision="bf16")
device = torch.device("cuda", local_rank)

# --- 2) Model & 4-bit Quantization ---
model_name = "unsloth/meta-Llama-3.1-8B-Instruct-bnb-4bit"
dtype = torch.bfloat16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=dtype,
    bnb_4bit_quant_storage=dtype,  # try storing quantized weights as BF16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=dtype,
    trust_remote_code=True,
)
model = prepare_model_for_kbit_training(model)

# --- 3) Apply LoRA ---
lora_config = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

for name, param in model.named_parameters():
    if ".lora_" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# --- Integrate torch.compile (PyTorch 2.0 feature) ---
try:
    model = torch.compile(model)
    accelerator.print("torch.compile: Model compiled successfully.")
except Exception as e:
    accelerator.print(f"torch.compile: Compilation failed with error: {e}. Continuing without compilation.")

# --- Optionally, we previously converted int parameters to buffers.
# For this version, we skip that conversion and rely on bitsandbytes to manage quantized weights.
# def convert_int_params_to_buffers(module: torch.nn.Module):
#     for name, param in list(module.named_parameters(recurse=False)):
#         if param is not None and not param.dtype.is_floating_point:
#             del module._parameters[name]
#             module.register_buffer(name, param.data)
#     for child in module.children():
#         convert_int_params_to_buffers(child)
#
# convert_int_params_to_buffers(model)

torch.cuda.empty_cache()

# --- Debug: Print buffer names and dtypes ---
if local_rank == 0:
    buffer_info = [(name, buf.dtype) for name, buf in model.named_buffers()]
    print("Buffers in model (name, dtype):")
    for name, dt in buffer_info:
        print(name, dt)

# --- 5) FSDP Wrapping ---
# Ignore the frozen base module from FSDP sharding.
base_model_submodule = model.model  
ignored_modules = [base_model_submodule]

auto_wrap_policy = partial(
    size_based_auto_wrap_policy,
    min_num_params=5e7,
    recurse=True,
    nonwrapped_numel=0,
)

mp_policy = MixedPrecision(
    param_dtype=dtype,      # cast parameters to BF16
    reduce_dtype=dtype,     # use BF16 for gradient reduction
    buffer_dtype=None,      # do not cast buffers
)

cpu_offload = None
# Uncomment the next line if needed:
# cpu_offload = CPUOffload(offload_params=True)

model = FSDP(
    model,
    auto_wrap_policy=auto_wrap_policy,
    sharding_strategy=ShardingStrategy.FULL_SHARD,
    use_orig_params=True,
    cpu_offload=cpu_offload,
    limit_all_gathers=True,
    device_id=device,
    ignored_modules=ignored_modules,
    mixed_precision=mp_policy,
)

# --- 6) Tokenizer & Dataset ---
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "</s>"

dataset = load_dataset(
    "json",
    data_files={"train": "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"},
    split="train[:10%]"
)

def tokenize_example(ex):
    enc = tokenizer(
        ex["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    enc["labels"] = enc["input_ids"].copy()
    return enc

dataset = dataset.map(tokenize_example, batched=True, remove_columns=["text"])
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
train_dataloader = accelerator.prepare(train_dataloader)

# --- 7) Optimizer ---
trainable_params = [p for p in model.parameters() if p.requires_grad]
try:
    from bitsandbytes.optim import Adam8bit
    optimizer = Adam8bit(trainable_params, lr=2e-4)
    accelerator.print("Using bitsandbytes Adam8bit optimizer.")
except ImportError:
    optimizer = torch.optim.AdamW(trainable_params, lr=2e-4)
    accelerator.print("Using torch.optim.AdamW optimizer.")
optimizer = accelerator.prepare(optimizer)

# --- 8) Training Loop ---
loss_history = []
model.train()
for step, batch in enumerate(train_dataloader, start=1):
    outputs = model(**batch)
    loss = outputs.loss
    accelerator.backward(loss)
    optimizer.step()
    optimizer.zero_grad()
    torch.cuda.empty_cache()
    if accelerator.is_main_process:
        loss_history.append((step, loss.item()))
    
    accelerator.print(f"Step {step} - Loss: {loss.item():.4f}")
    
    # Terminate after 60 steps.
    if step >= 60:
        break
accelerator.wait_for_everyone()

if accelerator.is_main_process:
    try:
        import matplotlib.pyplot as plt
        steps, losses = zip(*loss_history) if loss_history else ([], [])
        if steps:
            plt.figure(figsize=(8, 4))
            plt.plot(steps, losses, marker="o")
            plt.xlabel("Training Step")
            plt.ylabel("Loss")
            plt.title("Loss over the First 60 Training Steps")
            plt.grid(True)
            plt.savefig("loss_plot_v2.png")
            plt.show()
        else:
            print("No loss data recorded.")
    except ImportError:
        accelerator.print("matplotlib is not installed. Skipping loss plot.")


# --- 9) Save Model ---
if accelerator.is_main_process:
    final_model = accelerator.unwrap_model(model)
    final_model.save_pretrained("llama-8b-finetuned", safe_serialization=True)
    tokenizer.save_pretrained("llama-8b-finetuned")

if dist.is_initialized():
    dist.destroy_process_group()


Writing finetune_llama31_8b_fsdp2_qlora_revised_v2.py


In [11]:
!accelerate launch --num_processes=2 --mixed_precision=bf16 finetune_llama31_8b_fsdp2_qlora_revised_v2.py

2025-02-19 22:16:34.708720: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-19 22:16:34.708846: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-19 22:16:34.731531: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-19 22:16:34.731603: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-19 22:16:34.738261: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory

# FSDP2 with Gradient Accumulation
![image](loss_plot_gradient_accum.png)

In [12]:
%%writefile gradient_accumulation.py
import os

import random
import numpy as np
import torch

seed = 3407
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Optionally, enforce determinism (may slow down training):
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False


# --- Environment Setup ---
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import torch
import torch.distributed as dist
from torch.distributed.fsdp import (
    FullyShardedDataParallel as FSDP,
    CPUOffload,
    ShardingStrategy,
    MixedPrecision,
)
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from accelerate import Accelerator
from functools import partial

# --- 1) Distributed Setup ---
if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
else:
    local_rank = 0

accelerator = Accelerator(mixed_precision="bf16")
device = torch.device("cuda", local_rank)

# --- 2) Model & 4-bit Quantization ---
model_name = "unsloth/meta-Llama-3.1-8B-Instruct-bnb-4bit"
dtype = torch.bfloat16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=dtype,
    bnb_4bit_quant_storage=dtype,  # try storing quantized weights as BF16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=dtype,
    trust_remote_code=True,
)
model = prepare_model_for_kbit_training(model)

# --- 3) Apply LoRA ---
lora_config = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

for name, param in model.named_parameters():
    if ".lora_" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# --- Integrate torch.compile (PyTorch 2.0 feature) ---
try:
    model = torch.compile(model)
    accelerator.print("torch.compile: Model compiled successfully.")
except Exception as e:
    accelerator.print(f"torch.compile: Compilation failed with error: {e}. Continuing without compilation.")

# --- Optionally, we previously converted int parameters to buffers.
# For this version, we skip that conversion and rely on bitsandbytes to manage quantized weights.
# def convert_int_params_to_buffers(module: torch.nn.Module):
#     for name, param in list(module.named_parameters(recurse=False)):
#         if param is not None and not param.dtype.is_floating_point:
#             del module._parameters[name]
#             module.register_buffer(name, param.data)
#     for child in module.children():
#         convert_int_params_to_buffers(child)
#
# convert_int_params_to_buffers(model)

torch.cuda.empty_cache()

# --- Debug: Print buffer names and dtypes ---
if local_rank == 0:
    buffer_info = [(name, buf.dtype) for name, buf in model.named_buffers()]
    print("Buffers in model (name, dtype):")
    for name, dt in buffer_info:
        print(name, dt)

# --- 5) FSDP Wrapping ---
# Ignore the frozen base module from FSDP sharding.
base_model_submodule = model.model  
ignored_modules = [base_model_submodule]

auto_wrap_policy = partial(
    size_based_auto_wrap_policy,
    min_num_params=5e7,
    recurse=True,
    nonwrapped_numel=0,
)

mp_policy = MixedPrecision(
    param_dtype=dtype,      # cast parameters to BF16
    reduce_dtype=dtype,     # use BF16 for gradient reduction
    buffer_dtype=None,      # do not cast buffers
)

cpu_offload = None
# Uncomment the next line if needed:
# cpu_offload = CPUOffload(offload_params=True)

model = FSDP(
    model,
    auto_wrap_policy=auto_wrap_policy,
    sharding_strategy=ShardingStrategy.FULL_SHARD,
    use_orig_params=True,
    cpu_offload=cpu_offload,
    limit_all_gathers=True,
    device_id=device,
    ignored_modules=ignored_modules,
    mixed_precision=mp_policy,
)

# --- 6) Tokenizer & Dataset ---
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "</s>"

dataset = load_dataset(
    "json",
    data_files={"train": "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"},
    split="train[:10%]"
)

def tokenize_example(ex):
    enc = tokenizer(
        ex["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    enc["labels"] = enc["input_ids"].copy()
    return enc

dataset = dataset.map(tokenize_example, batched=True, remove_columns=["text"])
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
train_dataloader = accelerator.prepare(train_dataloader)

# --- 7) Optimizer ---
trainable_params = [p for p in model.parameters() if p.requires_grad]
try:
    from bitsandbytes.optim import Adam8bit
    optimizer = Adam8bit(trainable_params, lr=2e-4)
    accelerator.print("Using bitsandbytes Adam8bit optimizer.")
except ImportError:
    optimizer = torch.optim.AdamW(trainable_params, lr=2e-4)
    accelerator.print("Using torch.optim.AdamW optimizer.")
optimizer = accelerator.prepare(optimizer)

# --- 8) Training Loop with Gradient Accumulation ---
loss_history = []
model.train()
gradient_accumulation_steps = 16  # Adjust this value as needed

for step, batch in enumerate(train_dataloader, start=1):
    # The accelerator.accumulate context will accumulate gradients over the specified steps.
    with accelerator.accumulate(model):
        outputs = model(**batch)
        # Scale loss to account for gradient accumulation
        loss = outputs.loss / gradient_accumulation_steps
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        # Clear cache after the optimizer step to lower VRAM usage
        if accelerator.sync_gradients:
            torch.cuda.empty_cache()
    
    if accelerator.is_main_process:
        loss_history.append((step, outputs.loss.item()))
        accelerator.print(f"Step {step} - Loss: {outputs.loss.item():.4f}")
    
    # Terminate after 60 steps (micro-batches)
    if step >= 60:
        break
accelerator.wait_for_everyone()

if accelerator.is_main_process:
    try:
        import matplotlib.pyplot as plt
        steps, losses = zip(*loss_history) if loss_history else ([], [])
        if steps:
            plt.figure(figsize=(8, 4))
            plt.plot(steps, losses, marker="o")
            plt.xlabel("Training Step")
            plt.ylabel("Loss")
            plt.title("Loss over the First 60 Training Steps")
            plt.grid(True)
            plt.savefig("loss_plot_gradient_accum.png")
            plt.show()
        else:
            print("No loss data recorded.")
    except ImportError:
        accelerator.print("matplotlib is not installed. Skipping loss plot.")


# --- 9) Save Model ---
if accelerator.is_main_process:
    final_model = accelerator.unwrap_model(model)
    final_model.save_pretrained("llama-8b-finetuned", safe_serialization=True)
    tokenizer.save_pretrained("llama-8b-finetuned")

if dist.is_initialized():
    dist.destroy_process_group()


Writing gradient_accumulation.py


In [13]:
!accelerate launch --num_processes=2 --mixed_precision=bf16 gradient_accumulation.py

2025-02-19 22:31:49.962854: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-19 22:31:49.975927: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-19 22:31:49.985704: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-19 22:31:49.992782: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-19 22:31:49.997723: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factor