In [1]:
import os

# The Hugging Face token is read from the HF_TOKEN environment variable (or from the
# cached `huggingface-cli login`). It is deliberately NOT assigned in this cell:
# writing a literal here would overwrite a valid token that is already exported and
# would store a secret in the saved notebook.
if not os.environ.get('HF_TOKEN'):
    print("HF_TOKEN is not set; gated checkpoints can only be downloaded "
          "if you are already logged in via `huggingface-cli login`.")

# Restrict the process to GPUs 2 and 3. This has to happen before torch is imported
# and CUDA is initialised, otherwise the setting is ignored.
os.environ['CUDA_VISIBLE_DEVICES'] = "2,3"
import torch

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Checkpoint to fine-tune. Ids this notebook has branches for:
#   "meta-llama/Llama-2-7b-hf"
#   "meta-llama/Llama-2-13b-hf"
#   "openai-community/gpt2"
#   "mistralai/Mistral-7B-v0.1"
#   "google/gemma-7b"
#   "tiiuae/falcon-7b"
model_id = "google/gemma-7b"

# Which head to load on top of the backbone: "causal" or "sequential".
model_type = "causal"

from transformers import BitsAndBytesConfig

# Quantization settings handed to `from_pretrained`: 4-bit NF4 weights with double
# quantization, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [3]:
from transformers import AutoTokenizer, \
                         AutoModelForCausalLM, \
                         AutoModelForSequenceClassification

# Load the tokenizer and the model selected by `model_id` / `model_type`.
#
# Defines two notebook-level names used by the following cells:
#   tokenizer - tokenizer for the chosen checkpoint
#   model     - causal-LM model when model_type == "causal", otherwise a
#               sequence-classification model
#
# Raises ValueError for a `model_id` this notebook has no loading recipe for, so the
# problem is reported here and not as an AttributeError on `None` in a later cell.

# Checkpoints grouped by how they are loaded.
LLAMA_MODEL_IDS = ("meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf")
AUTO_MODEL_IDS = ("google/gemma-7b", "mistralai/Mistral-7B-v0.1", "tiiuae/falcon-7b")
GPT2_MODEL_ID = "openai-community/gpt2"
# Checkpoints for which the EOS token is always reused as the padding token.
EOS_AS_PAD_MODEL_IDS = LLAMA_MODEL_IDS + ("tiiuae/falcon-7b", GPT2_MODEL_ID)

if model_id in LLAMA_MODEL_IDS:
    from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaForSequenceClassification
    tokenizer_cls = LlamaTokenizer
    causal_cls, seq_cls_cls = LlamaForCausalLM, LlamaForSequenceClassification
    checkpoint, load_kwargs = model_id, {"quantization_config": bnb_config}
elif model_id in AUTO_MODEL_IDS:
    tokenizer_cls = AutoTokenizer
    causal_cls, seq_cls_cls = AutoModelForCausalLM, AutoModelForSequenceClassification
    checkpoint, load_kwargs = model_id, {"quantization_config": bnb_config}
elif model_id == GPT2_MODEL_ID:
    # GPT2LMHeadModel (not the bare GPT2Model) is required for causal fine-tuning:
    # only the LM-head variant produces logits and a language-modelling loss.
    from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2ForSequenceClassification
    tokenizer_cls = GPT2Tokenizer
    causal_cls, seq_cls_cls = GPT2LMHeadModel, GPT2ForSequenceClassification
    # GPT-2 is loaded from the short 'gpt2' alias and without quantization.
    checkpoint, load_kwargs = 'gpt2', {}
else:
    raise ValueError(
        f"Unsupported model_id {model_id!r}; expected one of "
        f"{LLAMA_MODEL_IDS + AUTO_MODEL_IDS + (GPT2_MODEL_ID,)}"
    )

tokenizer = tokenizer_cls.from_pretrained(checkpoint)
# Any model_type other than "causal" selects the sequence-classification head.
model_cls = causal_cls if model_type == "causal" else seq_cls_cls
model = model_cls.from_pretrained(checkpoint, **load_kwargs)

# Padding: reuse EOS for the checkpoints listed above, and also for any other
# tokenizer that ships without a pad token, so batching/collation can pad.
# A tokenizer that already defines its own pad token is left untouched.
if model_id in EOS_AS_PAD_MODEL_IDS or tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id


tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [4]:
# check lora trainable layers
def check_lora_trainable_layers(module=None):
    """Print the name and shape of every parameter that requires gradients.

    Args:
        module: Object exposing ``named_parameters()`` (for example a
            ``torch.nn.Module``). When omitted, the notebook-level ``model``
            variable is inspected, which is what a zero-argument call always did.

    Returns:
        None. One line ``<name> <shape>`` is printed per trainable parameter.
    """
    # Fall back to the global only when no explicit module is given, so the helper
    # can also be pointed at `lora_model` or any other module.
    target = model if module is None else module
    for name, param in target.named_parameters():
        if param.requires_grad:
            print(name, param.shape)

In [5]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType

# Wrap the loaded model with LoRA adapters.
#
# Defines: target_modules, lora_config, lora_model (and rebinds `model` to the
# k-bit-prepared model).
# Raises ValueError when `model_id` has no LoRA target-module recipe.

# Module names that receive LoRA adapters, per checkpoint. Every checkpoint the
# loading cell supports needs an entry here; an empty list would reach peft with
# nothing to adapt.
ATTENTION_PROJECTIONS = ["q_proj", "k_proj", "v_proj", "o_proj"]
LORA_TARGET_MODULES = {
    "meta-llama/Llama-2-7b-hf": ATTENTION_PROJECTIONS,
    "meta-llama/Llama-2-13b-hf": ATTENTION_PROJECTIONS,
    "google/gemma-7b": ATTENTION_PROJECTIONS,
    "mistralai/Mistral-7B-v0.1": ATTENTION_PROJECTIONS + ["down_proj", "up_proj", "gate_proj"],
    "tiiuae/falcon-7b": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
    "openai-community/gpt2": ["c_attn"],
}
if model_id not in LORA_TARGET_MODULES:
    raise ValueError(
        f"No LoRA target modules configured for model_id {model_id!r}; "
        f"known ids: {sorted(LORA_TARGET_MODULES)}"
    )
target_modules = list(LORA_TARGET_MODULES[model_id])

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    # Keep the PEFT task in sync with the head chosen by `model_type`.
    task_type=TaskType.CAUSAL_LM if model_type == "causal" else TaskType.SEQ_CLS,
    target_modules=target_modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    # NOTE(review): `modules_to_save` expects module names; 'weight' looks like a
    # parameter name and may match nothing — confirm the intent.
    modules_to_save=['weight']
)

lora_model = get_peft_model(model, lora_config)

In [6]:
# get the data ready
# NOTE(review): the wildcard import hides where names come from; `format_dat` is
# presumably defined in the local `utils` module — confirm and import it explicitly.
from utils import *
# Build the dataset using the tokenizer loaded earlier. The result is indexed with
# 'train' and 'test' when the Trainer is constructed further down.
# NOTE(review): the recorded output of this cell shows tokenizer warnings that
# padding/truncation to max_length was requested without a maximum length, so
# neither was applied — check the tokenizer call inside `format_dat`.
data_ds = format_dat(tokenizer=tokenizer)

Map (num_proc=4):   0%|          | 0/12358 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maxim

In [8]:
# fine-tuning!
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Build the TrainingArguments and the Trainer for the LoRA-wrapped model.
#
# Defines: output_dirname, training_args, trainer.
# Raises NotImplementedError for any model_type other than "causal".

# model_id contains a "/", so this resolves to a nested directory such as
# saved_models/FT_<org>/<name>.
output_dirname = "saved_models/FT_" + model_id

training_args = TrainingArguments(
      per_device_train_batch_size=1,
      per_device_eval_batch_size=1,
      # Accumulate 4 micro-batches per optimizer step (effective batch size 4 per device).
      gradient_accumulation_steps=4,
      evaluation_strategy='epoch',
      num_train_epochs=1,
      warmup_steps=2,
     # max_steps=1, # overrides num_train_epochs
      learning_rate=2e-4,
      # NOTE(review): mixed precision is fp16 here while the quantization config
      # computes in bfloat16 — confirm whether bf16=True was intended.
      fp16=True,
      logging_steps=1,
      output_dir=output_dirname,
      optim="paged_adamw_8bit"
)

if model_type == "causal":
    trainer = Trainer(
        model=lora_model,
        args=training_args,
        train_dataset=data_ds['train'],
        eval_dataset=data_ds['test'],
        # mlm=False selects causal (next-token) language-modelling labels.
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
else:
    # Fail here instead of silently leaving `trainer` undefined (or, worse, bound
    # to a Trainer left over from an earlier run of this cell).
    raise NotImplementedError(
        f"model_type {model_type!r} is not supported yet: "
        "sequence classification needs its own data collator"
    )

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
# Run the fine-tuning loop for the Trainer configured in the previous cell.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss


In [None]:
# Upload the LoRA adapter to the Hugging Face Hub.
# The write token comes from the HF_TOKEN environment variable (or the cached
# `huggingface-cli login`). It is deliberately not assigned here, so a valid token
# is never overwritten with a blank and no secret is saved in the notebook.
if not os.environ.get('HF_TOKEN'):
    print("HF_TOKEN is not set; push_to_hub will fall back to cached "
          "Hugging Face credentials, which need write access.")
hub_path = "aegunal/FT_IPD_gemma7b" #+ #model_id
lora_model.push_to_hub(hub_path)

adapter_model.safetensors:   0%|          | 0.00/54.6M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aegunal/FT_IPD_mistral7b/commit/1f7ccb98187df75fb7f796ca57c01b2a097fc079', commit_message='Upload model', commit_description='', oid='1f7ccb98187df75fb7f796ca57c01b2a097fc079', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
# Save the trainer's model to a local directory.
# NOTE(review): the directory name says "mistr" while `model_id` in this notebook is
# "google/gemma-7b" — confirm the path so the checkpoint is not mislabelled.
trainer.save_model("saved_models/FT_IPD_mistr")