In [1]:
import sys
sys.path.append("../src")

In [2]:
import torch
import transformers

from transformers import Trainer
from config import ModelArguments, DataArguments, TrainingArguments
from dataset import SupervisedDataset, DataCollatorForSupervisedDataset, smart_tokenizer_and_embedding_resize

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model_args, data_args, training_args = ModelArguments, DataArguments, TrainingArguments

In [4]:
model_args.model_name_or_path = "openlm-research/open_llama_3b_v2"

data_args.data_path = "../alpaca_data.json"

In [5]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [22]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float32
)

model = LlamaForCausalLM.from_pretrained(model_args.model_name_or_path, quantization_config=bnb_config)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [23]:
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

tokenizer = transformers.LlamaTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        max_length=training_args.model_max_length,
        use_fast=False,
    )
special_tokens_dict = dict()
if tokenizer.pad_token is None:
    special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
    special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
    special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None:
    special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

smart_tokenizer_and_embedding_resize(
    special_tokens_dict=special_tokens_dict,
    tokenizer=tokenizer,
    model=model,
)

In [24]:
train_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path)
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
data_module = dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)



In [25]:
print(train_dataset[4]['input_ids'])

tensor([    1, 10705,   325,   371, 10211,   347, 10801,   260,  4516, 29520,
         9078,   260,  2805,   347, 20488, 28963,   268,  2517, 29520,    13,
           13,  3093, 29586, 25712, 29537,    13, 28420,   260,   632,   661,
          333,   663,   290,   783,   260,  2807,  3513, 29520,    13,    13,
         3093, 29586, 11343, 29537, 29528,   663,   290,   783,   260,  2807,
         3513,   661,   306,   425,  1686,   372,   260,  1421,  5287,   410,
          260,  4618,  1542, 29520,   306,   425,   293,  4430,   296,   260,
         1421,   347,  2590,   290,   339,  5311,   443,   260,  1975,  2659,
          293,  1558,   290,  1771,   268,  3446, 29564, 29508,  8714, 29520,
         1981, 29522,  2374,   290, 10878, 19778, 29522,   389,   679,   437,
         1581,   290,  1771,   268, 14111,   295,   528,   306,   663,   290,
          783,   260,  2807,  3513, 29520,   306,  3694,   290,  9820,   268,
        14111, 29522,   510,   306,   663,   290,  8640,   268, 

In [26]:
print(len(train_dataset[4]['input_ids']))

197


In [27]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [28]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [29]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32001, 3200)
    (layers): ModuleList(
      (0-25): 26 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=3200, out_features=3200, bias=False)
          (k_proj): Linear4bit(in_features=3200, out_features=3200, bias=False)
          (v_proj): Linear4bit(in_features=3200, out_features=3200, bias=False)
          (o_proj): Linear4bit(in_features=3200, out_features=3200, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3200, out_features=8640, bias=False)
          (up_proj): Linear4bit(in_features=3200, out_features=8640, bias=False)
          (down_proj): Linear4bit(in_features=8640, out_features=3200, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaR

In [30]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

# model = get_peft_model(model, config)
# print_trainable_parameters(model)

# # Apply the accelerator. You can comment this out to remove the accelerator.
# model = accelerator.prepare_model(model)

In [31]:
model.add_adapter(lora_config, adapter_name="adapter_1")

In [32]:
import wandb, os
wandb.login()

wandb_project = "llama3-finetune"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project



In [33]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

In [34]:
from datetime import datetime

project = "llama3-finetune"
base_model_name = "llama3"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

tokenizer.pad_token = tokenizer.eos_token

In [None]:
train_dataset = train_dataset.remove_columns(books_dataset["train"].column_names)

In [38]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=None,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=1000,
        learning_rate=2.5e-5, # Want about 10x smaller than the Mistral learning rate
        logging_steps=50,
        bf16=False,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=50,                # Save checkpoints every 50 steps            # Evaluate and save checkpoints every 50 steps
        do_eval=False,                # Perform evaluation at the end of training
        report_to="wandb",           # Comment this out if you don't want to use weights & baises
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

max_steps is given, it will override any value given in num_train_epochs


In [36]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).