In [1]:
from datasets import load_dataset
from random import randrange

from transformers import AutoTokenizer, set_seed, pipeline,BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer, GenerationConfig,LlamaConfig,LlamaModel,AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, BitsAndBytesConfig 
import torch

from functools import partial
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM,PeftModel

import bitsandbytes as bnb

import os

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

In [6]:
print(len(dataset))

15011


In [4]:
dataset[1]

{'instruction': 'Which is a species of fish? Tope or Rope',
 'context': '',
 'response': 'Tope',
 'category': 'classification'}

## Setup Lora, BnB and Peft config

BnB config:

In [5]:
def create_bnb_config():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    return bnb_config

Peft and Lora config:

In [6]:
def create_peft_config(modules):
    """
    Create Parameter-Efficient Fine-Tuning config for your model
    :param modules: Names of the modules to apply Lora to
    """
    config = LoraConfig(
        r=16,  # dimension of the updated matrices
        lora_alpha=64,  # parameter for scaling
        target_modules=modules,
        lora_dropout=0.1,  # dropout probability for layers
        bias="none",
        task_type="CAUSAL_LM",
    )

    return config

Find all linear names (target modules)

In [7]:
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

### Pulling the model and creating tokenizer

In [8]:
def load_model(model_name, bnb_config):
    n_gpus = torch.cuda.device_count()
    max_memory = f'{40960}MB' #TODO Change if necessary

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto", 
        max_memory = {i: max_memory for i in range(n_gpus)},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [9]:
model_id = 'meta-llama/Llama-2-70b-chat-hf'

bnb_config = create_bnb_config()

In [10]:
model, tokenizer = load_model(model_id, bnb_config)

Loading checkpoint shards: 100%|██████████| 15/15 [02:05<00:00,  8.38s/it]


## Data preprocessing

prompt format

In [11]:
def create_prompt_formats(sample):
    """
    Format various fields of the sample ('instruction', 'context', 'response')
    Then concatenate them using two newline characters 
    :param sample: Sample dictionnary
    """

    INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    INSTRUCTION_KEY = "### Instruction:"
    INPUT_KEY = "Input:"
    RESPONSE_KEY = "### Response:"
    END_KEY = "### End"
    
    blurb = f"{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}\n{sample['instruction']}"
    input_context = f"{INPUT_KEY}\n{sample['context']}" if sample["context"] else None
    response = f"{RESPONSE_KEY}\n{sample['response']}"
    end = f"{END_KEY}"
    
    parts = [part for part in [blurb, instruction, input_context, response, end] if part]

    formatted_prompt = "\n\n".join(parts)
    
    sample["text"] = formatted_prompt

    return sample

Get the max length of sequence (tokens) for the model.

In [12]:
def get_max_length(model):
    conf = model.config
    print('configuration: ', conf)
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length

In [13]:
#test max length
max_length = get_max_length(model)

configuration:  LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-70b-chat-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 8192,
  "initializer_range": 0.02,
  "intermediate_size": 28672,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 64,
  "num_hidden_layers": 80,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false
  },
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.32.0.dev0",
  "use_cache": true,
  "vocab_size": 32000
}

Found 

tokenizing batches

In [14]:
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )

preprocess dataset

In [15]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: str):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    """
    
    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)#, batched=True)
    print('dataset check:', dataset)
    
    # Apply preprocessing to each batch of the dataset & and remove 'instruction', 'context', 'response', 'category' fields
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    print('preprocess func', _preprocessing_function)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["instruction", "context", "response", "category","text"] ,
    )

    # Filter out samples that have input_ids exceeding max_length
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)
    
    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset

In [36]:
dataset[2]

{'instruction': 'Why can camels survive for long without water?',
 'context': '',
 'response': 'Camels use the fat in their humps to keep them filled with energy and hydration for long periods of time.',
 'category': 'open_qa'}

In [16]:
dataset = preprocess_dataset(tokenizer, max_length, 9016, dataset)

Preprocessing dataset...
dataset check: Dataset({
    features: ['instruction', 'context', 'response', 'category', 'text'],
    num_rows: 15011
})
preprocess func functools.partial(<function preprocess_batch at 0x7fa174d8d8b0>, max_length=4096, tokenizer=LlamaTokenizerFast(name_or_path='meta-llama/Llama-2-70b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'pad_token': '</s>'}, clean_up_tokenization_spaces=False))


In [15]:
dataset[0]

{'input_ids': [1,
  13866,
  338,
  385,
  15278,
  393,
  16612,
  263,
  3414,
  29889,
  14350,
  263,
  2933,
  393,
  7128,
  2486,
  1614,
  2167,
  278,
  2009,
  29889,
  13,
  13,
  2277,
  29937,
  2799,
  4080,
  29901,
  13,
  5328,
  1784,
  5633,
  892,
  14982,
  716,
  297,
  278,
  7102,
  5955,
  29871,
  29929,
  29929,
  29941,
  29973,
  13,
  13,
  4290,
  29901,
  13,
  1576,
  29871,
  29929,
  29929,
  29941,
  471,
  1568,
  16710,
  975,
  322,
  3755,
  1422,
  515,
  967,
  27978,
  985,
  272,
  29889,
  7579,
  304,
  7102,
  5955,
  29892,
  1432,
  760,
  310,
  278,
  1559,
  471,
  8688,
  515,
  278,
  5962,
  701,
  29892,
  3704,
  278,
  6012,
  322,
  871,
  29871,
  29906,
  29900,
  29995,
  310,
  967,
  5633,
  892,
  8988,
  975,
  515,
  278,
  3517,
  12623,
  29889,
  7102,
  5955,
  14637,
  304,
  278,
  29871,
  29929,
  29929,
  29941,
  408,
  376,
  29874,
  7282,
  6564,
  29892,
  451,
  925,
  515,
  263,
  16905,
  29892,
  541,

## Helper functions

Printing the trainable parameters within the model

In [17]:
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        trainable_params /= 2
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {100 * trainable_params / all_param}"
    )

## Training Pipeline

In [18]:
def train(model, tokenizer, dataset, output_dir):
    # Apply preprocessing to the model to prepare it by
    # 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()
    print('gradient checkpointing check: ', model.gradient_checkpointing_enable())

    # 2 - Using the prepare_model_for_kbit_training method from PEFT
    print('check model before kbit: ', model)
    model = prepare_model_for_kbit_training(model)
    print('check model after kbit: ', model)

    # Get lora module names
    modules = find_all_linear_names(model)
    print('check target modules: ', modules)

    # Create PEFT config for these modules and wrap the model to PEFT
    peft_config = create_peft_config(modules)
    print('check peft config: ', peft_config)

    model = get_peft_model(model, peft_config)
    print('check model after peft', model)
    
    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)
    
    # Training parameters
    trainer = Trainer(
        model=model,
        train_dataset=dataset,
        args=TrainingArguments(
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_steps=2,
            max_steps=50,
            learning_rate=2e-4,
            fp16=True,
            logging_steps=1,
            output_dir="outputs",
            optim="paged_adamw_8bit",
        ),
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )
    
    model.config.use_cache = False 
    
    dtypes = {}
    for _, p in model.named_parameters():
        dtype = p.dtype
        if dtype not in dtypes: dtypes[dtype] = 0
        dtypes[dtype] += p.numel()
    total = 0
    for k, v in dtypes.items(): total+= v
    for k, v in dtypes.items():
        print(k, v, v/total)
    print('dtypes:', dtypes)

    do_train = True
    
    # Launch training
    print("Training...")
    
    if do_train:
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        print(metrics)    
    
    ###
    
    # Saving model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)
    
    # Free memory for merging weights
    del model
    del trainer
    torch.cuda.empty_cache()
     


In [19]:
output_dir = "results/llama2/final_checkpoint"

In [20]:
train(model, tokenizer, dataset, output_dir)

gradient checkpointing check:  None
check model before kbit:  LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 8192)
    (layers): ModuleList(
      (0-79): 80 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=8192, out_features=8192, bias=False)
          (k_proj): Linear4bit(in_features=8192, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=8192, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=8192, out_features=8192, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=8192, out_features=28672, bias=False)
          (up_proj): Linear4bit(in_features=8192, out_features=28672, bias=False)
          (down_proj): Linear4bit(in_features=28672, out_features=8192, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (pos

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,1.81
2,2.2271
3,2.5391
4,1.4079
5,1.41
6,1.252
7,1.1045
8,1.1082
9,1.4179
10,1.037


***** train metrics *****
  epoch                    =       0.01
  total_flos               =  9766870GF
  train_loss               =     1.1332
  train_runtime            = 0:11:15.70
  train_samples_per_second =      0.296
  train_steps_per_second   =      0.074
{'train_runtime': 675.7061, 'train_samples_per_second': 0.296, 'train_steps_per_second': 0.074, 'total_flos': 1.0487097544015872e+16, 'train_loss': 1.1331523597240447, 'epoch': 0.01}
Saving last checkpoint of the model...


In [2]:
model_NAME = 'meta-llama/Llama-2-70b-chat-hf'
new_model = "results/llama2/final_checkpoint"

In [3]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_NAME,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map = "auto",    
)

Loading checkpoint shards: 100%|██████████| 15/15 [00:10<00:00,  1.44it/s]


In [4]:
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

In [5]:
model.save_pretrained("fine-tuned-llama2")

In [7]:
total_params_b = sum(p.numel() for p in model.parameters())
total_params_b

68976648192

In [8]:
total_params_n = sum(p.numel() for p in base_model.parameters())
total_params_n

68976648192

In [23]:
WEIGHTS = "./results/llama2/final_checkpoint/"
MODEL = 'meta-llama/Llama-2-70b-chat-hf'

model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="./offload/"
)
model = PeftModel.from_pretrained(
        model, 
        WEIGHTS, 
        torch_dtype=torch.float16,
        device_map="auto",
        offload_folder="offload", 
        
    )
model = model.merge_and_unload()
model.save_pretrained("llama_evo")

Loading checkpoint shards: 100%|██████████| 15/15 [00:06<00:00,  2.22it/s]


KeyboardInterrupt: 

In [20]:
from accelerate import disk_offload
disk_offload(model, offload_dir="offload/")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 8192)
    (layers): ModuleList(
      (0-79): 80 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(
            in_features=8192, out_features=8192, bias=False
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=8192, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=8192, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): Linear4bit(
            in_features=8192, out_features=1024, bias=False
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (de

In [21]:
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
model = model.merge_and_unload()

output_merged_dir = "results/llama2/final_merged_checkpoint"
os.makedirs(output_merged_dir, exist_ok=True)
model.save_pretrained(output_merged_dir, safe_serialization=True)

# save tokenizer for easy inference
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(output_merged_dir)

Loading checkpoint shards: 100%|██████████| 15/15 [00:28<00:00,  1.92s/it]


ValueError: We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules need to be offloaded: base_model.model.model.layers.77, base_model.model.model.layers.78, base_model.model.model.layers.79, base_model.model.model.norm, base_model.model.lm_head.

## Agents test

In [2]:
from langchain.llms import HuggingFacePipeline
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
llama_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task='text-generation',
    temperature=0.01,
    repetition_penalty=1.1,
)

In [None]:
llm = HuggingFacePipeline(pipeline=llama_pipeline)