# Reference: 
- https://www.youtube.com/watch?v=gXDsVcY8TXQ&t=3146s
- https://github.com/TrelisResearch/install-guides/blob/main/multi-gpu/test_scripts/test_fsdp.py
- https://github.com/huggingface/trl/issues/1303

In [8]:
import os
os.environ["WANDB__SERVICE_WAIT"] = "300"
os.environ["HF_HOME"] = "/NS/llm-1/nobackup/afkhan/HF_CACHE/Misc"
os.environ["HF_DATASETS_CACHE"] = "/NS/llm-1/nobackup/afkhan/HF_CACHE/Datasets"
os.environ["TRANSFORMERS_CACHE"] = "/NS/llm-1/nobackup/afkhan/HF_CACHE/Models"

In [9]:
cache_dir = os.getenv("TRANSFORMERS_CACHE")

In [10]:
# !pip install transformers datasets bitsandbytes deepspeed accelerate

In [11]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from accelerate import PartialState, notebook_launcher

In [12]:
# !pip install wandb
import wandb
# from utils import print_trainable_parameters

In [13]:
def train_fsdp_trainer():

    DEVICE_MAP = 'FSDP'

    model_name = 'Llama-2-7b-hf'
    model_path = "/NS/llm-1/nobackup/vnanda/llm_base_models/Llama-2-7b-hf"

    model = AutoModelForCausalLM.from_pretrained(
        model_path, cache_dir=cache_dir
    )

    tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir=cache_dir)
    tokenizer.pad_token = tokenizer.eos_token

    ds_name = "weyaxi--sci-datasets"
    ds = load_dataset("Weyaxi/sci-datasets", "alpaca")

    # Keep only 100 examples for now
    ds['train'] = ds['train'].select(range(100))

    def merge_columns(example):
        example['text'] = '### Instruction: ' + example['instruction'] + ' ### Answer: ' + example['output']
        return example

    ds['train'] = ds['train'].map(merge_columns)

    ds = ds.map(
        lambda samples: tokenizer(samples["text"]), batched=True,
    ) 

    ## Wandb Related

    WANDB_PROJECT = "FSDP-Analysis"
    WANDB_RUN_NAME = f"{model_name}-{ds_name}-full-finetune" + "-fsdp"

    ## Logging Related

    REPORT_TO = "wandb"
    OUTPUT_DIR = f"./output/{model_name}-{ds_name}-full-finetune" + "-fsdp"
    LOGGING_DIR = f"./logs/{model_name}-{ds_name}-full-finetune" + "-fsdp"
    LOGGING_STRATEGY = "steps"
    LOGGING_STEPS = 10

    ## Training Duration Related

    MAX_STEPS = 1000

    ## Optimizer Related

    LEARNING_RATE = 2e-4
    LR_SCHEDULER_TYPE = "linear"
    WARMUP_RATIO = 0.1

    ## Batch Related

    PER_DEVICE_TRAIN_BATCH_SIZE = 8
    PER_DEVICE_EVAL_BATCH_SIZE = 8

    ## Gradient Related (Also related to Parallelism)

    GRADIENT_CHECKPOINTING = True
    # Use reentrant starts a more efficient method of recomputing the graph from checkpoints
    USE_REENTRANT = True # Set False for DDP and True for Model/Pipeline Parallelism

    # Configure Wandb project and run

    wandb.init(project=WANDB_PROJECT, name=WANDB_RUN_NAME)

    training_args = TrainingArguments(
        # Logging Related
        report_to=REPORT_TO,
        output_dir = OUTPUT_DIR,
        logging_dir = LOGGING_DIR,
        logging_strategy = LOGGING_STRATEGY,
        logging_steps = LOGGING_STEPS,
        # Training Duration Related
        max_steps = MAX_STEPS,
        # Optimizer Related
        learning_rate = LEARNING_RATE,
        lr_scheduler_type = LR_SCHEDULER_TYPE,
        warmup_ratio = WARMUP_RATIO,
        # Batch Related
        per_device_train_batch_size = PER_DEVICE_TRAIN_BATCH_SIZE,
        per_device_eval_batch_size = PER_DEVICE_EVAL_BATCH_SIZE,
        # Gradient Related
        gradient_checkpointing = GRADIENT_CHECKPOINTING,
        gradient_checkpointing_kwargs = {"use_reentrant": USE_REENTRANT},
    )

    wandb.config.update(training_args)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
        train_dataset=ds['train'],
        eval_dataset=ds['train'],
    )

    # if getattr(trainer.accelerator.state, "fsdp_plugin", None):
    #     from peft.utils.other import fsdp_auto_wrap_policy

    #     fsdp_plugin = trainer.accelerator.state.fsdp_plugin
    #     fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(trainer.model)

    trainer.train()

    # Save Model
    model.save_pretrained(f'Saves/{model_name}-{ds_name}-full-finetune' + "-fsdp")

In [14]:
notebook_launcher(train_fsdp_trainer, args=(), num_processes=2)

Launching training on 2 GPUs.


Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.77s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.78s/it]
Map: 100%|██████████| 100/100 [00:01<00:00, 89.98 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 237.46 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 234.68 examples/s]
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maflah[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Currently logged in as: [33maflah[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[2024-07-16 14:37:23,906] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-07-16 14:37:25,038] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
W0716 14:37:37.204000 140358814619456 torch/multiprocessing/spawn.py:145] Terminating process 3788212 via signal SIGTERM
E0716 14:37:37.347000 140358814619456 torch/distributed/elastic/multiprocessing/api.py:695] failed (exitcode: 1) local_rank: 0 (pid: 3788210) of fn: train_fsdp_trainer (start_method: fork)
E0716 14:37:37.347000 140358814619456 torch/distributed/elastic/multiprocessing/api.py:695] Traceback (most recent call last):
E0716 14:37:37.347000 140358814619456 torch/distributed/elastic/multiprocessing/api.py:695]   File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 656, in _poll
E0716 14:37:37.347000 140358814619456 torch/distributed/elastic/multiprocessing/api.py:695]     self._pc.join(-1)
E0716 14:37:37.347000 

ChildFailedError: 
============================================================
train_fsdp_trainer FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-07-16_14:37:36
  host      : sws-2h100-02.mpi-sws.org
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 3788210)
  error_file: /tmp/torchelastic_qhh0qw_e/none_8bger7wv/attempt_0/0/error.json
  traceback : Traceback (most recent call last):
    File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
      return f(*args, **kwargs)
    File "/tmp/ipykernel_3787111/2977254477.py", line 106, in train_fsdp_trainer
      trainer.train()
    File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/site-packages/transformers/trainer.py", line 1932, in train
      return inner_training_loop(
    File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/site-packages/transformers/trainer.py", line 2330, in _inner_training_loop
      self.optimizer.step()
    File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/site-packages/accelerate/optimizer.py", line 170, in step
      self.optimizer.step(closure)
    File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 75, in wrapper
      return wrapped(*args, **kwargs)
    File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/site-packages/torch/optim/optimizer.py", line 391, in wrapper
      out = func(*args, **kwargs)
    File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/site-packages/torch/optim/optimizer.py", line 76, in _use_grad
      ret = func(self, *args, **kwargs)
    File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/site-packages/torch/optim/adamw.py", line 177, in step
      has_complex = self._init_group(
    File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/site-packages/torch/optim/adamw.py", line 124, in _init_group
      state["exp_avg"] = torch.zeros_like(
  torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 
  
============================================================

Traceback (most recent call last):
  File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start
    self.asyncio_loop.run_forever()
  File "/N

: 