Code inspired by https://colab.research.google.com/drive/1vk8i01apaSp59GVV2yInxOV15QwCwMrg

In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [1]:
import os, torch, logging
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from peft import LoraConfig, PeftModel
from trl import SFTTrainer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset: load the "train" split of the instruction dataset from the Hugging Face Hub.
data_name = "mlabonne/guanaco-llama2-1k"
training_data = load_dataset(data_name, split="train")

# Model and tokenizer names
base_model_name = "NousResearch/Llama-2-7b-chat-hf"  # checkpoint that gets fine-tuned
refined_model = "llama-2-7b-mlabonne-enhanced"  # output directory used when saving after training

# Tokenizer
# `trust_remote_code` is intentionally left at its default (False): the tokenizer
# resolves to the built-in Llama tokenizer class, so no Hub-hosted Python code
# needs to be executed to load it.
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# Pad with the EOS token, on the right-hand side of each sequence.
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

# Quantization config
# Flip this flag to load the base model in 4-bit (NF4) instead of 8-bit.
# The default keeps the 8-bit behaviour.
USE_4BIT_QUANTIZATION = False

if USE_4BIT_QUANTIZATION:
    # 4-bit config
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
    )
else:
    # 8-bit config
    quant_config = BitsAndBytesConfig(
        load_in_8bit=True,
    )

# Model: load the quantized base checkpoint with every module mapped to device 0.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0},
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

# Sanity check: number of training examples.
print(len(training_data))

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.23s/it]


1000


In [3]:
# LoRA config: adapter hyper-parameters applied on top of the quantized base model.
peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
)

# Training params
train_params = TrainingArguments(
    output_dir="./results_modified",  # intermediate checkpoints are written here
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,  # -1: the run length is governed by num_train_epochs
    warmup_ratio=0.03,
    group_by_length=True,
    # The plain "constant" schedule has no warmup phase, so `warmup_ratio` above
    # would be silently ignored. "constant_with_warmup" ramps the learning rate
    # up over the warmup steps and then holds it at `learning_rate`.
    lr_scheduler_type="constant_with_warmup",
)

# Trainer: supervised fine-tuning on the "text" column of the dataset.
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=training_data,
    peft_config=peft_parameters,
    dataset_text_field="text",
    tokenizer=llama_tokenizer,
    args=train_params,
)

# Training
fine_tuning.train()

# Save the trained model, and the tokenizer (whose pad token was customised
# earlier in the notebook) into the same directory so both can be reloaded together.
fine_tuning.model.save_pretrained(refined_model)
llama_tokenizer.save_pretrained(refined_model)

  0%|          | 0/125 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 1.4476, 'learning_rate': 0.0002, 'epoch': 0.2}




{'loss': 1.4159, 'learning_rate': 0.0002, 'epoch': 0.4}




{'loss': 1.2843, 'learning_rate': 0.0002, 'epoch': 0.6}




{'loss': 1.3249, 'learning_rate': 0.0002, 'epoch': 0.8}


100%|██████████| 125/125 [10:00<00:00,  4.80s/it]

{'loss': 1.3748, 'learning_rate': 0.0002, 'epoch': 1.0}
{'train_runtime': 600.4294, 'train_samples_per_second': 1.665, 'train_steps_per_second': 0.208, 'train_loss': 1.3695145568847655, 'epoch': 1.0}





: 