In [1]:
import os
os.environ['HF_TOKEN'] = "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" # read token
os.environ['CUDA_VISIBLE_DEVICES'] = "1,2,3"
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# which model to ft
model_id = "tiiuae/falcon-7b" # "meta-llama/Llama-2-7b-hf"
                                      # "meta-llama/Llama-2-13b-hf"
                                      # "openai-community/gpt2"
                                      # "mistralai/Mistral-7B-v0.1"
                                      # "google/gemma-7b"
                                      # "tiiuae/falcon-7b"

from transformers import BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [3]:
from transformers import AutoTokenizer, \
                         AutoModelForCausalLM

if model_id == "meta-llama/Llama-2-7b-hf" or model_id == "meta-llama/Llama-2-13b-hf" :
    from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaForSequenceClassification
    tokenizer = LlamaTokenizer.from_pretrained(model_id)
    model = LlamaForCausalLM.from_pretrained(model_id,
                                                quantization_config=bnb_config,)
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id
elif model_id == "google/gemma-7b":
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id,
                                                    quantization_config=bnb_config)
elif model_id == "mistralai/Mistral-7B-v0.1":
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id,
                                                    quantization_config=bnb_config)
elif model_id == "tiiuae/falcon-7b":
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id,
                                                 quantization_config=bnb_config)
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id
elif model_id == "openai-community/gpt2":
    from transformers import GPT2Tokenizer, GPT2Model
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token
    model = GPT2Model.from_pretrained('gpt2')
else:
    model = None
    tokenizer = None


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# check lora trainable layers
def check_lora_trainable_layers():
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name, param.shape)

def check_lora_target_mods():
    for name in model.named_modules():
        print(name)

check_lora_target_mods()

('', FalconForCausalLM(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (rotary_emb): FalconRotaryEmbedding()
          (query_key_value): Linear4bit(in_features=4544, out_features=4672, bias=False)
          (dense): Linear4bit(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): Linear4bit(in_features=4544, out_features=18176, bias=False)
          (act): GELU(approximate='none')
          (dense_4h_to_h): Linear4bit(in_features=18176, out_features=4544, bias=False)
        )
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=4544, out_features=65024, bias=False)
))
('transformer', 

In [5]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

target_modules = []
if (model_id == "meta-llama/Llama-2-7b-hf") or \
   (model_id == "google/gemma-7b"):
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
elif model_id == "mistralai/Mistral-7B-v0.1":
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "down_proj", "up_proj", "gate_proj"]
elif model_id == "tiiuae/falcon-7b":
    target_modules = ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]
elif model_id == "openai-community/gpt2":
    target_modules = ["c_proj"]

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    modules_to_save=['weight']
)

lora_model = get_peft_model(model, lora_config)

In [6]:
# get the data ready
from utils import *
data_ds = format_dat(tokenizer=tokenizer)

Map (num_proc=4):   0%|          | 0/12358 [00:00<?, ? examples/s]

In [7]:
# fine-tuning!
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

output_dirname = "saved_models/FT_" + model_id

training_args = TrainingArguments(
      per_device_train_batch_size=1,
      per_device_eval_batch_size=1,
      gradient_accumulation_steps=4, # this is for optimization
      evaluation_strategy='epoch',
      num_train_epochs=1,
      warmup_steps=2,
     # max_steps=1, # overrides num_train_epochs
      learning_rate=2e-4,
      fp16=True, # this is for optimization
      logging_steps=1,
      output_dir=output_dirname,
      optim="paged_adamw_8bit"
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=data_ds['train'],
    eval_dataset=data_ds['test'],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


OutOfMemoryError: Caught OutOfMemoryError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/peft/peft_model.py", line 1083, in forward
    return self.base_model(
           ^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/peft/tuners/tuners_utils.py", line 161, in forward
    return self.model.forward(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/transformers/models/falcon/modeling_falcon.py", line 1282, in forward
    transformer_outputs = self.transformer(
                          ^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/transformers/models/falcon/modeling_falcon.py", line 1149, in forward
    outputs = self._gradient_checkpointing_func(
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/_compile.py", line 24, in inner
    return torch._dynamo.disable(fn, recursive)(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 489, in _fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/_dynamo/external_utils.py", line 17, in inner
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/utils/checkpoint.py", line 482, in checkpoint
    return CheckpointFunction.apply(function, preserve, *args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/autograd/function.py", line 553, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/utils/checkpoint.py", line 261, in forward
    outputs = run_function(*args)
              ^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/transformers/models/falcon/modeling_falcon.py", line 811, in forward
    attn_outputs = self.self_attention(
                   ^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gunala/miniconda3/envs/QLORAFTEnv/lib/python3.11/site-packages/transformers/models/falcon/modeling_falcon.py", line 450, in forward
    attn_output = F.scaled_dot_product_attention(
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.11 GiB. GPU 0 has a total capacity of 10.75 GiB of which 340.50 MiB is free. Including non-PyTorch memory, this process has 10.42 GiB memory in use. Of the allocated memory 9.49 GiB is allocated by PyTorch, and 100.21 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


In [10]:
os.environ['HF_TOKEN'] = "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" # write token
hub_path = "aegunal/FT_IPD_gemma7b" #+ #model_id
lora_model.push_to_hub(hub_path)

adapter_model.safetensors:   0%|          | 0.00/51.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aegunal/FT_IPD_gemma7b/commit/1c8fb17c7003218ae7498a1f6a9c879e0f60bded', commit_message='Upload model', commit_description='', oid='1c8fb17c7003218ae7498a1f6a9c879e0f60bded', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
trainer.save_model("saved_models/FT_IPD_gemma")