<a href="https://colab.research.google.com/github/ankits1089/FineTune-LLM/blob/main/Finetune_llm_PEFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [69]:
# #install PEFT & datasets
# !pip install datasets
# !pip install -U peft

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # all output p


In [2]:
# Automatically detect device (GPU > MPS > CPU)
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

print(f"Using device: {device}")

Using device: cuda


In [70]:
# # Load the model and tokenizer
model_name = "facebook/opt-350m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load model & move to device
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)


In [4]:
# tokenize prompt
tokenizer.pad_token_id = tokenizer.eos_token_id
inputs = tokenizer("What is AI?", return_tensors="pt",padding=True, truncation=True,max_length=100)

In [5]:
# generation
outputs = model.generate(**inputs.to(device))
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

What is AI?

AI is a computer program that is able to learn and adapt to the environment. It is


In [6]:
# Total number of paramters
model.num_parameters()


331196416

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
%cd drive/My Drive/Data Science/Projects

/content/drive/My Drive/Data Science/Projects


In [8]:
# create a question and answer set in a list with dictionary

from datasets import Dataset

QA_list = []

with open ('QA.txt') as doc:
    Q_counter = 0
    for line in doc:
        if 'Question' in line:
            Q_dict = {'question':line.replace('Question:"',"").strip().strip('"'),'answer':''}
            QA_list.append(Q_dict)
            Q_counter+=1
        elif 'Answer' in line:
            A_counter = Q_counter-1
            modified_line = line.replace('Answer:"',"").strip().strip('"')
            QA_list[A_counter]['answer'] = modified_line
        else:
            pass

dataset = Dataset.from_list(QA_list)

In [9]:
# check dataest
dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 77
})

In [10]:
# Tokenize as question answer pair which will be used for training

def tokenize_function(examples):
    text = examples["question"] + examples["answer"]


    tokenizer.pad_token = tokenizer.eos_token

    tokenizer.padding_side = "left"
    tokenizer.truncation_side = "right"
    tokenized_inputs = tokenizer(
        text,
        # return_tensors="pt",
        padding="max_length",
        # padding=True,
        truncation=True,
        max_length=30
    )

    return tokenized_inputs

In [72]:
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=False,
    # batch_size=1,
    # drop_last_batch=True
)

print(tokenized_dataset)

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 77
})


In [12]:
# adding labels
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

In [13]:
# Prepare train test split
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 69
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 8
    })
})


In [14]:
def inference(text, model, tokenizer, max_input_tokens=512, max_output_tokens=30):
  # Tokenize
  input_ids = tokenizer(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate
  # device = "mps" if torch.backends.mps.is_available() else "cpu"

  generated_tokens_with_prompt = model.generate(
      **input_ids.to(device),
    max_length=max_output_tokens
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer

In [15]:
test_text = split_dataset['test'][3]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {split_dataset['test'][3]['answer']}")
print("\nModel's answer: ")
print(inference(test_text, model, tokenizer))

Question input (test): Why is insisting that you are 'right' and the other person is 'wrong' considered bad communication?
Correct answer from Lamini docs: Because it creates conflict and prevents understanding between both parties.

Model's answer: 

Because it's not a matter of


In [16]:
# create lora config

from peft import LoraConfig, TaskType, get_peft_model

# create LoRA configuration object
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, # type of task to train on
    inference_mode=False, # set to False for training
    target_modules=["q_proj", "v_proj"],
    r=8, # dimension of the smaller matrices
    lora_alpha=32, # scaling factor
    lora_dropout=0.1, # dropout of LoRA layers
    modules_to_save=["lm_head"]
)

In [17]:
# model.unload()
# model.delete_adapter('default')

In [18]:
# using data collator for batch processing
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer,padding=True)

In [19]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters() # The number of parameters is high as lm head is being also trained.
# Though the result is coming to be similar without lm_head for similar number of steps (considerably less paramters to be trained: 700K)

trainable params: 26,525,696 || all params: 357,722,112 || trainable%: 7.4152


In [20]:
# model config
model.peft_config

{'default': LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='facebook/opt-350m', revision=None, inference_mode=False, r=8, target_modules={'v_proj', 'q_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=['lm_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)}

In [46]:
from transformers import TrainingArguments, Trainer

# initialising training arguments
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-6,

  # Number of training epochs
  num_train_epochs=12,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=-1,

  # Batch size for training
  per_device_train_batch_size=4,

  # Directory to save model checkpoints
  output_dir='fine_tuned_opt_350m_final',

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=4, # Number of update steps between two evaluations
  save_steps=16, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=4, # Batch size for evaluation
  eval_strategy="steps",
  logging_strategy="steps",
  # fp16=False,
  logging_steps=8,
  optim="adafactor",
  gradient_accumulation_steps = 1,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=2,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)

In [47]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator)


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [48]:
# Train the model
# trainer.train()
trainer.train(resume_from_checkpoint=True)

Step,Training Loss,Validation Loss
184,0.186,2.065768
188,0.186,2.067105
192,0.1976,2.075464
196,0.1976,2.082425
200,0.2048,2.093981
204,0.2048,2.09886
208,0.1844,2.099129
212,0.1844,2.100057
216,0.193,2.102139


TrainOutput(global_step=216, training_loss=0.03232844643018864, metrics={'train_runtime': 9.4661, 'train_samples_per_second': 87.47, 'train_steps_per_second': 22.818, 'total_flos': 49165858897920.0, 'train_loss': 0.03232844643018864, 'epoch': 12.0})

In [68]:
# # load the finetuned model

# from peft import PeftModel, PeftConfig
# from transformers import AutoModelForCausalLM

# config = PeftConfig.from_pretrained("fine_tuned_opt_350m_final/checkpoint-144")
# model_base = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
# model_finetuned = PeftModel.from_pretrained(model_base,
# "fine_tuned_opt_350m_final/checkpoint-144",
#  is_trainable=True
# )

Check the model output on the question - what is the larget lake?

In [55]:
# tokenize prompt
question = "What is the largest lake?"
tokenizer.pad_token_id = tokenizer.eos_token_id
inputs = tokenizer(question, return_tensors="pt",padding=True, truncation=True,max_length=100)


In [64]:
# generation
outputs = model.generate(**inputs.to(device))
print("Question: ",question,"\nAnswer:",tokenizer.decode(outputs[0], skip_special_tokens=True)[len(question):])

Question:  What is the largest lake? 
Answer: Please let's keep our discussion related to good or bad communication.Please let's keep our discussion related


Compare the result with that of the base model

In [43]:
model_name = "facebook/opt-350m"
# Load model & move to device
model_base = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)


In [57]:
# generation
outputs = model_base.generate(**inputs.to(device))
print("Question: ",question,"\nAnswer:",tokenizer.decode(outputs[0], skip_special_tokens=True)[len(question):])

Question:  What is the largest lake? 
Answer: 
Lake of the Gods.


Trained model output from QA document vs base model

In [62]:
# mention question number
qno = 35

test_text = split_dataset['train'][qno]['question']
print("Question input (test):", test_text)
print(f"\nCorrect answer from Lamini docs: {split_dataset['train'][qno]['answer']}")
print("\nModel's answer: ")
print(inference(test_text, model, tokenizer))
print("\nBase model's answer: ")
print(inference(test_text, model_base, tokenizer))

Question input (test): What is feeling empathy?

Correct answer from Lamini docs: Acknowledging how the other person is feeling based on what they have expressed.

Model's answer: 
It helps you understand the other person’s point of view.It helps you understand their point of view.It

Base model's answer: 


What is feeling empathy?

What is feeling empathy?

What is feeling empathy?

What
