## LLM fine tuning with self-instruct datasets


This notebook provides a comprehensive guide for fine-tuning the Mistral-7B-Instruct model with LoRA, using the training/validation datasets synthetically generated with the self-instruct framework.  
     
This is step 5 of the self-instruct framework, continuing from the "synthetic_data_generation_self-instruct_notebook".

### Initialization

In [None]:
# Uncomment the following lines to install the required Python packages
# (trl provides the SFTTrainer used in the fine-tuning section below)

#!pip install transformers
#!pip install datasets
#!pip install accelerate
#!pip install bitsandbytes
#!pip install peft
#!pip install trl

In [None]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling

import torch
from torch.utils.data import Dataset, random_split
from transformers import TrainingArguments, Trainer

### Load the synthetic datasets

Define prompt keys

In [None]:
# Markers used to assemble Mistral-instruct style training prompts.
INSTRUCTION_KEY = "<s>[INST]"
INPUT_KEY = "Input:"
RESPONSE_KEY = "[/INST]"
END_KEY = "</s>"
DEFAULT_SEED = 42

# Template for records whose instruction is self-contained (no extra context),
# e.g. a question asking for the year a historic figure was born.
# The doubled braces leave {instruction} / {response} as placeholders to be
# filled in later with str.format().
PROMPT_NO_INPUT_FORMAT = (
    f"{INSTRUCTION_KEY}{{instruction}}"
    f"{RESPONSE_KEY}{{response}}"
    f"{END_KEY}"
)

# Template for records that carry an input string serving as context for the
# instruction, e.g. a Wikipedia passage from which information is extracted.
PROMPT_WITH_INPUT_FORMAT = (
    f"{INSTRUCTION_KEY}{{instruction}}"
    f"{INPUT_KEY}{{input}}"
    f"{RESPONSE_KEY}{{response}}"
    f"{END_KEY}"
)

Prepare training data in Datasets format

In [None]:
# NOTE: this `Dataset` shadows the `torch.utils.data.Dataset` imported in the
# initialization cell.
from datasets import Dataset, load_dataset

# TODO: replace the placeholder with the path/name of your source document.
# It is kept as a string so the cell is valid Python; the CSV file names below
# are derived from it.
html_file = "<your document.html>"
TRN_FILE = html_file+"_trnfromseed_mistral_20240505.csv"

# The training records are read from the CSV; the "train" split of the result
# is what the trainer consumes later in the notebook.
dataset = load_dataset("csv", data_files=TRN_FILE)

In [None]:
def _add_text(rec):
    """Add a ``text`` field holding the rendered training prompt to a record.

    The question and answer are flattened to a single line: every run of
    whitespace (newlines, tabs, repeated spaces) is collapsed to ONE space, so
    words on either side of a line break are not fused together. The response
    is followed by the '  [STOP][STOP]' marker before the end-of-sequence key.

    Args:
        rec: Mapping with at least the keys ``"question"`` and ``"answer"``.

    Returns:
        The same record, with ``rec["text"]`` set.

    Raises:
        ValueError: If the question or the answer is missing or blank.
    """
    def _flatten(value):
        # str() guards against CSV cells that were parsed as numbers;
        # a missing value (None) is treated as empty text.
        if value is None:
            return ""
        return " ".join(str(value).split())

    instruction = _flatten(rec["question"])
    response = _flatten(rec["answer"])

    if not instruction:
        raise ValueError(f"Expected an instruction in: {rec}")

    if not response:
        raise ValueError(f"Expected a response in: {rec}")

    rec["text"] = PROMPT_NO_INPUT_FORMAT.format(
        instruction=instruction,
        response=response + '  [STOP][STOP]',
    ).strip()

    return rec

In [None]:
# Render the prompt text for every record (adds the "text" column)
dataset = dataset.map(_add_text)

# Drop the source columns that are no longer needed. remove_columns() does
# this directly, instead of running a function-less identity map() over the
# whole dataset just to drop columns.
dataset = dataset.remove_columns(['context', 'seed_question', 'question', 'answer'])

### LLM Fine tuning

Here we use the Mistral-7b-instruct model for PEFT/LoRA based fine-tuning. Load the model first.

In [None]:
import os

# Base checkpoint to fine-tune and the directory that receives the outputs
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
OUTPUT_DIR = 'models/mistral-ft-doc-d2lai-self-instruct-20240505'

device_map="auto"

# Hugging Face access token. Read it from the HF_TOKEN environment variable
# rather than pasting the secret into the notebook. When the variable is not
# set, TOKEN is None.
TOKEN = os.environ.get("HF_TOKEN")

In [None]:
# Load the base model weights in bfloat16, placed according to `device_map`.
model_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    token=TOKEN,
)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **model_load_kwargs)

In [None]:
# Training-time model settings.
model_config = model.config
model_config.use_cache = False      # turn the generation cache off for training
# NOTE(review): pretraining_tp looks like a Llama-config option; presumably it
# has no effect on a Mistral model -- confirm.
model_config.pretraining_tp = 1
model.gradient_checkpointing_enable()

In [None]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=TOKEN)

# Reuse the EOS token for padding, and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# NOTE(review): the training text already ends with END_KEY ("</s>"), and this
# same tokenizer is later handed to the inference pipeline. Presumably
# add_eos_token=True appends a further EOS in both cases -- confirm that this
# is intended.
tokenizer.add_eos_token = True

# Display the resulting BOS/EOS settings as the cell output
(tokenizer.add_bos_token, tokenizer.add_eos_token)

Set LoRA and training parameters

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# NOTE(review): the model above is loaded in bfloat16 without quantization;
# prepare_model_for_kbit_training is normally paired with k-bit (quantized)
# loading -- confirm its effect on memory/dtype here is intended.
model = prepare_model_for_kbit_training(model)

# Projection layers that receive LoRA adapters
lora_target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj"]

# LoRA hyper-parameters
lora_config = LoraConfig(
    r=256,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model with the LoRA adapter and report the trainable share
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
import os

# Keep Weights & Biases switched off through its environment flag as well
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        learning_rate=5e-6,
        num_train_epochs=3,
        logging_strategy="steps",
        logging_steps=20,
        save_strategy="steps",
        save_steps=20000,
        save_total_limit=10,
        fp16=False,
        bf16=False,
        max_grad_norm=0.3,
        max_steps=-1,                  # -1: the run length is governed by num_train_epochs
        # NOTE(review): with lr_scheduler_type="constant" the warmup_ratio below
        # presumably has no effect ("constant_with_warmup" would apply it) --
        # confirm which behavior is intended.
        warmup_ratio=0.03,
        group_by_length=True,
        lr_scheduler_type="constant",
        # Explicit seed; same value as the TrainingArguments default
        seed=DEFAULT_SEED,
        # The string "none" turns every reporting integration off. Passing the
        # Python value None does not: it leaves the library default in place.
        report_to="none",
)

In [None]:
from trl import SFTTrainer

# Supervised fine-tuning on the rendered "text" column, one example per
# sequence (no packing) and no explicit sequence-length cap.
# NOTE(review): `model` was already wrapped by get_peft_model in the LoRA
# cell, so passing peft_config again is presumably redundant -- confirm
# against the installed trl version.
sft_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    peft_config=lora_config,
    max_seq_length=None,
    dataset_text_field="text",
    args=training_args,
    packing=False,
)
trainer = SFTTrainer(**sft_kwargs)

In [None]:
# Run the fine-tuning and report how long it took (wall-clock)

import time

st = time.time()
trainer.train()
et = time.time()

elapsed_time = et - st
print('Training time:', elapsed_time, 'seconds')

In [None]:
# Persist the trained LoRA adapter together with the tokenizer files.
# OUTPUT_DIR is the trainer's configured output_dir, so both land in the
# same place.

trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

### Inference with the fine-tuned model using the transformers pipeline

In [None]:
# `pipeline` must be imported before it is used here; otherwise a fresh
# "Restart & Run All" stops at this cell with a NameError.
from transformers import pipeline

# Text-generation pipeline built from the fine-tuned model and its tokenizer
pipe = pipeline(
    "text-generation", 
    model=model, 
    tokenizer = tokenizer, 
    torch_dtype=torch.bfloat16, 
    device_map=device_map
)

In [None]:
from transformers import pipeline

def Mistral_Infer(query):
    """Generate an answer to ``query`` with the fine-tuned model.

    The query is wrapped in the ``<s>[INST] ... [/INST]`` prompt and passed to
    the module-level ``pipe``. The answer is the text generated after the
    prompt, cut at the first '[STOP][STOP]' marker.

    Args:
        query: The question text.

    Returns:
        Tuple ``(answer, elapsed_time)``: the extracted answer string and the
        wall-clock duration of the pipeline call in seconds.
    """
    # Local import so the function does not rely on `time` having been
    # imported by the training cell.
    import time

    prompt = f"<s>[INST] {query} [/INST]"

    st = time.perf_counter()

    sequences = pipe(
        prompt,
        do_sample=True,
        max_new_tokens=512,
        temperature=0.05,
        top_p=0.95,
        num_return_sequences=1,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    elapsed_time = time.perf_counter() - st

    generated = sequences[0]['generated_text']

    # Strip the prompt itself rather than splitting on every '[/INST]': a
    # query or an answer that contains that marker would otherwise yield the
    # wrong slice.
    if generated.startswith(prompt):
        completion = generated[len(prompt):]
    else:
        # Fallback: take what follows the first response marker, or the whole
        # text when the marker is absent.
        _, marker, tail = generated.partition('[/INST]')
        completion = tail if marker else generated

    answer = completion.split('[STOP][STOP]')[0]

    return answer, elapsed_time


In [None]:
import pandas as pd

# Validation questions and reference answers generated in the previous step
VAL_FILE = html_file+"_valfromseed_mistral-7b.csv"

df_val = pd.read_csv(VAL_FILE)
question_list = df_val["question"].tolist()
ref_answer_list = df_val["answer"].tolist()

In [None]:
# Pick one validation example and compare the model's answer with the reference
IDX = 5
query, ref_answer = question_list[IDX], ref_answer_list[IDX]

answer, elapsed_time = Mistral_Infer(query)

print(
    "[Question]:  ", query, '\n\n',
    "[Answer]:  ", answer, '\n\n',
    "[Ref_Answer]:  ", ref_answer,
)
print("\nInference time = ", elapsed_time, " seconds")