In [14]:
!pip install "transformers==4.31.0" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.40.2" "trl==0.4.7" "safetensors>=0.3.1" --upgrade --quiet

In [15]:
import torch
from random import randrange
from datasets import load_dataset
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoTokenizer

# Load Pubmed Q/A dataset

In [16]:
# Labeled PubMedQA configuration ("pqa_labeled"), train split, loaded from the hub.
dataset = load_dataset(
    path="pubmed_qa",
    name="pqa_labeled",
    split="train",
)



In [17]:
# Report how many records were loaded, then print one record picked at random.
n_examples = len(dataset)
print(f"dataset size: {n_examples}")
print(dataset[randrange(n_examples)])

dataset size: 1000
{'pubid': 19054501, 'question': 'Is motion perception deficit in schizophrenia a consequence of eye-tracking abnormality?', 'context': {'contexts': ['Studies have shown that schizophrenia patients have motion perception deficit, which was thought to cause eye-tracking abnormality in schizophrenia. However, eye movement closely interacts with motion perception. The known eye-tracking difficulties in schizophrenia patients may interact with their motion perception.', 'Two speed discrimination experiments were conducted in a within-subject design. In experiment 1, the stimulus duration was 150 msec to minimize the chance of eye-tracking occurrence. In experiment 2, the duration was increased to 300 msec, increasing the possibility of eye movement intrusion. Regular eye-tracking performance was evaluated in a third experiment.', 'At 150 msec, speed discrimination thresholds did not differ between schizophrenia patients (n = 38) and control subjects (n = 33). At 300 msec,

# Load Pretrained Llama-2 Model

In [18]:
def format_instruction(sample):
    """Render one dataset record as an instruction-tuning prompt.

    The record's ``long_answer`` is placed in the "Input" section and its
    ``question`` in the "Response" section, so the model learns to produce
    the question from the answer text.

    Args:
        sample: Mapping that provides the keys ``'long_answer'`` and
            ``'question'``.

    Returns:
        The prompt string: three "###" sections separated by blank lines,
        ending with a single trailing newline.
    """
    sections = (
        "### Instruction:\n"
        "Use the Input below to create an instruction, which could have been "
        "used to generate the input using an LLM.\n",
        f"### Input:\n{sample['long_answer']}\n",
        f"### Response:\n{sample['question']}\n",
    )
    # Joining with a newline yields the blank line between consecutive sections.
    return "\n".join(sections)

In [19]:
from random import randrange

# Format one randomly chosen record so the prompt layout can be inspected.
preview_sample = dataset[randrange(len(dataset))]
format_instruction(preview_sample)

'### Instruction:\nUse the Input below to create an instruction, which could have been used to generate the input using an LLM.\n\n### Input:\nThe association between plasma glucose levels and CVD risk is mainly explained by insulin resistance, which raises the question of whether glucose lowering per se without changes in the processes that underlie hyperglycemia should be the sole clinical paradigm in the treatment of type 2 diabetes or its prevention.\n\n### Response:\nDoes insulin resistance drive the association between hyperglycemia and cardiovascular risk?\n'

In [20]:
!python -c "import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'"
!pip install ninja packaging
!MAX_JOBS=4 pip install flash-attn --no-build-isolation

Collecting flash-attn
  Using cached flash_attn-2.0.2.tar.gz (4.2 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from flash-attn)
  Using cached einops-0.6.1-py3-none-any.whl (42 kB)
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.0.2-cp310-cp310-linux_x86_64.whl size=121473226 sha256=bc9f279dda7f714d8525f08c198fe9e5e2331923659cde17962fd1c427fcc244
  Stored in directory: /root/.cache/pip/wheels/f1/28/cc/12adb10d502288f0a486c21f915fad842b17cb8d2fc5564e1c
Successfully built flash-attn
Installing collected packages: einops, flash-attn
Successfully installed einops-0.6.1 flash-attn-2.0.2


In [22]:
!pip install llama-trainer

Collecting llama-trainer
  Downloading llama_trainer-0.2.0-py3-none-any.whl (11 kB)
Collecting black (from llama-trainer)
  Downloading black-23.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Collecting mypy-extensions>=0.4.3 (from black->llama-trainer)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Collecting pathspec>=0.9.0 (from black->llama-trainer)
  Downloading pathspec-0.11.2-py3-none-any.whl (29 kB)
Installing collected packages: pathspec, mypy-extensions, black, llama-trainer
Successfully installed black-23.7.0 llama-trainer-0.2.0 mypy-extensions-1.0.0 pathspec-0.11.2


In [26]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Flash attention is enabled automatically when a CUDA GPU with compute
# capability major version >= 8 is present; otherwise the stock attention is kept.
# The availability check avoids querying the device capability when no CUDA
# device is usable.
use_flash_attention = False
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    from llama_trainer.utils.llama_patch import replace_attn_with_flash_attn
    print("Using flash attention")
    # replace attention with flash attention
    replace_attn_with_flash_attn()
    use_flash_attention = True


# Hugging Face model id
model_id = "NousResearch/Llama-2-7b-hf" # non-gated
# model_id = "meta-llama/Llama-2-7b-hf" # gated


# BitsAndBytesConfig int-4 config: 4-bit NF4 quantization with double
# quantization, computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized model; use_cache=False because the model is loaded for training.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, use_cache=False, device_map="auto")
# NOTE(review): presumably disables the tensor-parallel emulation path of the
# Llama implementation — confirm against the transformers Llama docs.
model.config.pretraining_tp = 1

# Validate that the model is using flash attention, by comparing doc strings
if use_flash_attention:
    from llama_trainer.utils.llama_patch import forward
    assert model.model.layers[0].self_attn.forward.__doc__ == forward.__doc__, "Model is not using flash attention"


# The EOS token is reused as the padding token, and padding goes on the right.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Using flash attention


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [27]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA hyperparameters, based on the QLoRA paper.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)


# Prepare the quantized model for k-bit training, then wrap it with the LoRA adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)


In [28]:
from transformers import TrainingArguments

# Training configuration for the fine-tuning run. Earlier commented-out
# experiments (fp16 / max_steps variants) were removed; this is the only
# configuration the notebook actually uses.
args = TrainingArguments(
    output_dir="llama-7-int4-pubmed",
    num_train_epochs=3,
    # A larger per-device batch is used when flash attention is active.
    per_device_train_batch_size=6 if use_flash_attention else 4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    # NOTE(review): bf16/tf32 presumably require an Ampere-or-newer GPU — confirm.
    bf16=True,
    tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=True,  # disable tqdm since with packing the values are incorrect
)

In [29]:
from trl import SFTTrainer

# Max sequence length for the model and for packing of the dataset.
max_seq_length = 2048

# Supervised fine-tuning trainer: each record is rendered with
# format_instruction and the results are packed into sequences.
trainer = SFTTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset,
    formatting_func=format_instruction,
    packing=True,
    max_seq_length=max_seq_length,
)

In [30]:
# Run the fine-tuning loop with the trainer configured above.
trainer.train() # there will not be a progress bar since tqdm is disabled


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 1.7508, 'learning_rate': 0.0002, 'epoch': 1.05}
{'train_runtime': 143.8132, 'train_samples_per_second': 20.86, 'train_steps_per_second': 1.731, 'train_loss': 1.6924525499343872, 'epoch': 2.06}


TrainOutput(global_step=16, training_loss=1.6924525499343872, metrics={'train_runtime': 143.8132, 'train_samples_per_second': 20.86, 'train_steps_per_second': 1.731, 'train_loss': 1.6924525499343872, 'epoch': 2.06})

In [31]:
# Persist the trained model through the trainer. No path is passed, so it is
# presumably written to the trainer's configured output directory — confirm.
trainer.save_model()

In [32]:
if use_flash_attention:
    # unpatch flash attention before reloading the model
    from llama_trainer.utils.llama_patch import unplace_flash_attn_with_attn
    unplace_flash_attn_with_attn()

import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Reload from the directory the training arguments already point to.
# args.output_dir is deliberately not overwritten with a hard-coded absolute
# path: that only works when the notebook runs from one specific working
# directory and silently mutates the training arguments.
model = AutoPeftModelForCausalLM.from_pretrained(
    args.output_dir,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)
tokenizer = AutoTokenizer.from_pretrained(args.output_dir)


Reloading llama model, unpatching flash attention


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [33]:
from datasets import load_dataset
from random import randrange


# Load dataset from the hub and get a sample
dataset = load_dataset("pubmed_qa", "pqa_labeled", split="train")
sample = dataset[randrange(len(dataset))]

# Same layout as the training prompt, with the response section left empty
# for the model to complete.
prompt = f"""### Instruction:
Use the Input below to create an instruction, which could have been used to generate the input using an LLM.

### Input:
{sample['long_answer']}

### Response:
"""

# `truncation=True` is omitted: without a max_length it falls back to no
# truncation and only emits a warning. Inputs follow the model's device
# instead of assuming the default CUDA device.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
prompt_length = input_ids.shape[1]
outputs = model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9, temperature=0.9)

# The generated sequence starts with the prompt tokens. Drop them by token
# count rather than slicing the decoded text by len(prompt), which goes wrong
# whenever decoding does not reproduce the prompt character-for-character.
generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(f"Prompt:\n{sample['long_answer']}\n")
print(f"Generated instruction:\n{generated_text}")
print(f"Ground truth:\n{sample['question']}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Prompt:
Although planning and operating a multidisciplinary clinic is not a new venture, to the best of the authors' knowledge, they have provided the first report demonstrating the benefits described above.

Generated instruction:
The use of the LLM generated instruction could be used to create an instruction, which could have been used to generate the input using an LLM.

### Instruction:
Although planning and operating a multidisciplinary clinic is not a new venture, to the best of the authors' knowledge, they have provided the first report demonstrating the benefits described above.

### Input:
Although planning and operating a multidisciplinary clinic
Ground truth:
Multidisciplinary breast cancer clinics. Do they work?


In [34]:
from peft import AutoPeftModelForCausalLM

# Reload the adapter on an unquantized base model so the LoRA weights can be
# merged. float16 is requested explicitly (the same dtype the inference reload
# uses) so the merged model is not loaded and saved at a wider default
# precision.
model = AutoPeftModelForCausalLM.from_pretrained(
    args.output_dir,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)

# Merge LoRA and base model
merged_model = model.merge_and_unload()

# Save the merged model (safetensors format) together with the tokenizer
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('merged_model/tokenizer_config.json',
 'merged_model/special_tokens_map.json',
 'merged_model/tokenizer.json')

In [40]:
# Authenticate with the Hugging Face Hub before pushing. login() is called
# without arguments and renders an interactive widget in the notebook.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [41]:
# push merged model to the hub
merged_model.push_to_hub("botch/Llama-2-7b-pubmed")
tokenizer.push_to_hub("botch/Llama-2-7b-pubmed-tokenizer")

CommitInfo(commit_url='https://huggingface.co/botch/Llama-2-7b-pubmed-tokenizer/commit/2296e3ca02361914a10995b91b0e71a1db7808f7', commit_message='Upload tokenizer', commit_description='', oid='2296e3ca02361914a10995b91b0e71a1db7808f7', pr_url=None, pr_revision=None, pr_num=None)