In [None]:
import transformers
import torch

In [None]:
from datasets import load_dataset
from trl import SFTTrainer

# Load PEFT
from peft import (
        get_peft_model, 
        prepare_model_for_kbit_training, 
        LoraConfig
    )

In [None]:
import os

# Hugging Face access token; it is passed to from_pretrained() when the model
# and tokenizer are downloaded.
# Read it from the HF_TOKEN environment variable so the secret is never saved
# inside the notebook. Falls back to '' (the previous placeholder) when unset.
TOKEN = os.environ.get("HF_TOKEN", "")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base checkpoint shared by the model and the tokenizer.
model_name = "meta-llama/Llama-2-7b-hf"

# Load the weights in 8-bit and let device_map="auto" decide their placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=TOKEN,
    device_map="auto",
    load_in_8bit=True,
)

# Matching tokenizer for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=TOKEN)

In [None]:
# Decode the EOS token id back to text to confirm which string marks end-of-sequence.
tokenizer.decode(tokenizer.eos_token_id) # </s> is EOS

In [None]:
# We'll fine-tune on a dialogue summarization dataset (SAMSum).
data = load_dataset("samsum")

# Keep each split under its own name for the cells below.
data_train = data["train"]
data_val = data["validation"]
data_test = data["test"]

In [None]:
def generate_prompt(dialogue, summary=None, eos_token="</s>"):
  """Build the summarization prompt used for both training and inference.

  Args:
    dialogue: The conversation text to summarize.
    summary: Reference summary. When given (training), it is placed after
      "Summary:" and followed by ``eos_token``. When None or empty
      (inference), the prompt stops right after "Summary:" so the model
      continues from there.
    eos_token: End-of-sequence marker text appended after the summary.

  Returns:
    The prompt string. Nothing is appended after the EOS marker (training)
    or after "Summary:" (inference).
  """
  instruction = "Summarize the following:\n"
  # Named dialogue_block rather than `input` to avoid shadowing the builtin.
  dialogue_block = f"{dialogue}\n"
  if summary:
    answer = f"Summary: {summary} {eos_token}"
  else:
    # No trailing spaces: they would be tokenized as extra tokens at the end
    # of the prompt, so the inference prompt would not end the way training
    # examples continue after "Summary:".
    answer = "Summary:"
  # The pieces are joined with a single space, as before, so the body of the
  # prompt is unchanged; only the trailing whitespace is gone.
  return " ".join([instruction, dialogue_block, answer])

# Sanity check: render the first training example as a full prompt (dialogue plus reference summary).
print(generate_prompt(data_train[0]["dialogue"], data_train[0]["summary"]))

In [None]:
# Checking performance of the untuned model
input_prompt = generate_prompt(data_train[50]["dialogue"])

# Tokenize once and move the tensors to wherever the model's first parameters
# live, instead of assuming a device literally named "cuda".
encoded_prompt = tokenizer(input_prompt, return_tensors="pt").to(model.device)
input_tokens = encoded_prompt["input_ids"]

# torch.autocast(device_type="cuda") is the non-deprecated spelling of
# torch.cuda.amp.autocast().
with torch.autocast(device_type="cuda"):
  generation_output = model.generate(
      input_ids=input_tokens,
      # Pass the mask explicitly so generate() does not have to guess it.
      attention_mask=encoded_prompt["attention_mask"],
      max_new_tokens=1000,
      do_sample=True,
      top_k=10,
      top_p=0.9,
      temperature=0.3,
      repetition_penalty=1.15,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id,
    )

# The decoded text contains the prompt followed by the generated continuation.
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

In [None]:
#QLORA paper suggests fine tuning all linear layers

import re
# Collect the leaf names (e.g. "q_proj") of every linear layer by inspecting
# the modules themselves rather than regex-parsing the printed model repr.
# NOTE(review): this relies on the quantized bitsandbytes layers subclassing
# torch.nn.Linear — confirm for the installed bitsandbytes version.
linear_layer_names = {
    qualified_name.rsplit(".", 1)[-1]
    for qualified_name, module in model.named_modules()
    if isinstance(module, torch.nn.Linear)
}

# Leave the output projection out of the LoRA targets: "all linear layers"
# conventionally excludes the output layer (PEFT's "all-linear" shortcut does
# the same).
linear_layer_names.discard("lm_head")

# Sort so the list comes out in the same order on every run.
target_modules = sorted(linear_layer_names)

In [None]:
# Display the model's module tree (the cell's last expression is rendered as its repr).
model

In [None]:
# Show the module names that will be handed to LoraConfig as target_modules.
print(target_modules)

In [None]:
# Register a dedicated padding token with the tokenizer.
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
# Resize the model's token embeddings to the tokenizer's new length so the added token has a row.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Display the tokenizer's special-token mapping (expected to now list the "<PAD>" pad token).
tokenizer.special_tokens_map

In [None]:
# LoRA adapter settings for causal-LM fine-tuning: rank-8 adapters
# (alpha 8, dropout 0.1, no bias terms) on every module named in target_modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=target_modules,
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    bias="none",
)

In [None]:
# Prepare the 8-bit base model for training, then wrap it with LoRA adapters per lora_config.
# NOTE(review): this cell rebinds `model`, so re-running it would presumably wrap an
# already-wrapped model — re-run from the model-loading cell instead.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [None]:
# --- Run configuration -------------------------------------------------------
# Checkpoints and the final model are written under this folder.
output_dir = "cp"

# Batching: 4 samples per device, 4 accumulation steps per optimizer update.
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
per_device_eval_batch_size = 4
eval_accumulation_steps = 4

# Optimization.
optim = "paged_adamw_32bit"
learning_rate = 5e-4
max_grad_norm = 0.3
max_steps = 30
# NOTE(review): with the "constant" scheduler the warmup ratio is presumably
# unused ("constant_with_warmup" is the warmup variant) — confirm intent.
warmup_ratio = 0.03
lr_scheduler_type = "constant"

# Checkpointing, logging and evaluation cadence.
save_steps = 10
logging_steps = 10
evaluation_strategy = "steps"

training_args = transformers.TrainingArguments(
    # Output location
    output_dir=output_dir,
    # Batching
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    eval_accumulation_steps=eval_accumulation_steps,
    group_by_length=True,
    # Optimization
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    # Cadence
    evaluation_strategy=evaluation_strategy,
    save_steps=save_steps,
    logging_steps=logging_steps,
    # Distributed
    ddp_find_unused_parameters=False,
)

In [None]:
def formatting_func(prompt):
  """Turn a batch of dataset rows into training prompt strings.

  Args:
    prompt: Batch mapping with parallel "dialogue" and "summary" sequences.

  Returns:
    A list with one generate_prompt(dialogue, summary) string per row.
  """
  return [
      generate_prompt(dialogue, summary)
      for dialogue, summary in zip(prompt["dialogue"], prompt["summary"])
  ]


# Supervised fine-tuning trainer: prompts are produced by formatting_func from
# the train/validation splits.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    peft_config=lora_config,
    train_dataset=data_train,
    eval_dataset=data_val,
    formatting_func=formatting_func,
    max_seq_length=1024,
)

# Upcast the layer norms to float32 for more stable training.
# nn.Module.to() converts the module in place, so no rebinding is needed.
for name, module in trainer.model.named_modules():
    if "norm" not in name:
        continue
    module.to(torch.float32)

# Run training, then save the result under "<output_dir>/final".
trainer.train()
trainer.save_model(f"{output_dir}/final")

In [None]:
# Test trained model performance

# Training leaves the model in train mode; switch to eval mode so that LoRA
# dropout is disabled while generating.
model.eval()

input_prompt = generate_prompt(data_train[50]["dialogue"])

# Tokenize once and move the tensors to wherever the model's first parameters
# live, instead of assuming a device literally named "cuda".
encoded_prompt = tokenizer(input_prompt, return_tensors="pt").to(model.device)
input_tokens = encoded_prompt["input_ids"]

# torch.autocast(device_type="cuda") is the non-deprecated spelling of
# torch.cuda.amp.autocast().
with torch.autocast(device_type="cuda"):
    generation_output = model.generate(
        input_ids=input_tokens,
        # Pass the mask explicitly so generate() does not have to guess it.
        attention_mask=encoded_prompt["attention_mask"],
        max_new_tokens=100,
        do_sample=True,
        top_k=10,
        top_p=0.9,
        temperature=0.3,
        repetition_penalty=1.15,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
      )

# The decoded text contains the prompt followed by the generated summary.
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

Loading the saved model and running inference

In [None]:
from peft import PeftModel
# Checkpoint of the base model and folder holding the fine-tuned weights.
model_name = "meta-llama/Llama-2-7b-hf"
peft_model_id = "cp/final"

# Tokenizer with the same extra "<PAD>" token that was added before training.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=TOKEN)
tokenizer.add_special_tokens({"pad_token": "<PAD>"})

# 8-bit base model, with embeddings resized to the extended vocabulary.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=TOKEN,
    device_map="auto",
    load_in_8bit=True,
)
model.resize_token_embeddings(len(tokenizer))

# Attach the saved fine-tuned weights on top of the base model.
peft_model = PeftModel.from_pretrained(
    model,
    peft_model_id,
    torch_dtype=torch.float16,
    offload_folder="lora_results/lora_7/temp",
)

In [None]:
# Generate a summary with the reloaded fine-tuned model.
input_prompt = generate_prompt(data_train[50]["dialogue"])

# Tokenize once and move the tensors to wherever the model's first parameters
# live, instead of assuming a device literally named "cuda".
encoded_prompt = tokenizer(input_prompt, return_tensors="pt").to(model.device)
input_tokens = encoded_prompt["input_ids"]

# torch.autocast(device_type="cuda") is the non-deprecated spelling of
# torch.cuda.amp.autocast().
with torch.autocast(device_type="cuda"):
    generation_output = peft_model.generate(
        input_ids=input_tokens,
        # Pass the mask explicitly so generate() does not have to guess it.
        attention_mask=encoded_prompt["attention_mask"],
        max_new_tokens=100,
        do_sample=True,
        top_k=10,
        top_p=0.9,
        temperature=0.3,
        repetition_penalty=1.15,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
      )

# The decoded text contains the prompt followed by the generated summary.
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)