# Fine-tune Mistral-7b with DPO

In [37]:
%pip install --quiet --upgrade\
    pip \
    accelerate\
    peft\
    bitsandbytes\
    transformers\
    trl\
    sentencepiece

Note: you may need to restart the kernel to use updated packages.


# Import Necessary Libraries and Packages

In [38]:
import torch

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    logging,
)

from peft import LoraConfig

from trl import DPOConfig, DPOTrainer

# Load Model and Tokenizer

In [39]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [40]:
model_name = "teknium/OpenHermes-2.5-Mistral-7B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)

# Model to fine-tune
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
model.config.use_cache = False

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.34s/it]


In [41]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Load Dataset for finetuning

In [42]:
# Load dataset
dataset = load_dataset("Intel/orca_dpo_pairs")
dataset

DatasetDict({
    train: Dataset({
        features: ['system', 'question', 'chosen', 'rejected'],
        num_rows: 12859
    })
})

In [43]:
dataset = dataset['train']

# Save columns
original_columns = dataset.column_names
original_columns

['system', 'question', 'chosen', 'rejected']

In [44]:
# Print sample
dataset[1]

{'system': 'You are an AI assistant. You will be given a task. You must generate a detailed and long answer.',
 'question': 'Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One',
 'chosen': 'Midsummer House is a moderately priced Chinese restaurant with a 3/5 customer rating, located near All Bar One.',
 'rejected': ' Sure! Here\'s a sentence that describes all the data you provided:\n\n"Midsummer House is a moderately priced Chinese restaurant with a customer rating of 3 out of 5, located near All Bar One, offering a variety of delicious dishes."'}

## Format dataset to apply mixtral chat template

In [45]:
def chatml_format(example):
    # Format system
    if len(example['system']) > 0:
        message = {"role": "system", "content": example['system']}
        system = tokenizer.apply_chat_template([message], tokenize=False)
    else:
        system = ""

    # Format instruction
    message = {"role": "user", "content": example['question']}
    prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)

    # Format chosen answer
    chosen = example['chosen'] + "<|im_end|>\n"

    # Format rejected answer
    rejected = example['rejected'] + "<|im_end|>\n"

    return {
        "prompt": system + prompt,
        "chosen": chosen,
        "rejected": rejected,
    }

In [46]:
# Format dataset
dataset = dataset.map(
    chatml_format,
    remove_columns=original_columns
).shuffle(seed=42).select(range(1000))

In [47]:
dataset

Dataset({
    features: ['chosen', 'rejected', 'prompt'],
    num_rows: 1000
})

# Finetune the model with DPO Trainer

## Find target modules in the modules to finetune

In [48]:
def find_target_modules(model):
    # Initialize a Set to Store Unique Layers
    unique_layers = set()
    
    # Iterate Over All Named Modules in the Model
    for name, module in model.named_modules():
        # Check if the Module Type Contains 'Linear4bit'
        if "Linear4bit" in str(type(module)):
            # Extract the Type of the Layer
            layer_type = name.split('.')[-1]
            
            # Add the Layer Type to the Set of Unique Layers
            unique_layers.add(layer_type)

    # Return the Set of Unique Layers Converted to a List
    return list(unique_layers)

In [49]:
find_target_modules(model)

['k_proj', 'q_proj', 'o_proj', 'v_proj', 'up_proj', 'gate_proj', 'down_proj']

## Lora Configurations

In [50]:
## we define the LoRA configurations to train the model. r=16, lora_alpha = 2*r (usually) but here we are usign the same

In [51]:
# LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
)

## Defining DPO Configurations

In [52]:
# Training arguments
training_args = DPOConfig(
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=-1,
    save_steps=25,
    logging_steps=1,
    output_dir='./mixtral',
    optim="paged_adamw_32bit",
    bf16=True,
    max_prompt_length=1024,
    max_length=1536,
)

## DPO Trainer Configurations

In [53]:
# Create DPO trainer
dpo_trainer = DPOTrainer(
    model,
    ref_model=None,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
    beta=0.1,
    
)



## Train the model

In [54]:
# Fine-tune model with DPO
dpo_trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
1,0.6931
2,0.4639
3,0.1752
4,0.2246
5,0.1359
6,0.0419
7,0.0127
8,0.0096
9,0.0684
10,0.086




TrainOutput(global_step=62, training_loss=0.03395676131920265, metrics={'train_runtime': 978.1518, 'train_samples_per_second': 1.022, 'train_steps_per_second': 0.063, 'total_flos': 0.0, 'train_loss': 0.03395676131920265, 'epoch': 0.992})

## Model Saving

In [56]:
# Save artifacts
dpo_trainer.model.save_pretrained("final_checkpoint")
tokenizer.save_pretrained("final_checkpoint")

('final_checkpoint/tokenizer_config.json',
 'final_checkpoint/special_tokens_map.json',
 'final_checkpoint/tokenizer.model',
 'final_checkpoint/added_tokens.json',
 'final_checkpoint/tokenizer.json')

In [57]:
# Flush memory
import gc
del dpo_trainer, model
gc.collect()
torch.cuda.empty_cache()

In [58]:
# Reload model in FP16 (instead of NF4)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    return_dict=True,
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]


In [60]:
from peft import PeftModel

# Merge base model with the adapter
model = PeftModel.from_pretrained(base_model, "final_checkpoint")
model = model.merge_and_unload()

In [62]:
# Save model and tokenizer
new_model = "finetuned-mixtral-model"
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

('finetuned-mixtral-model/tokenizer_config.json',
 'finetuned-mixtral-model/special_tokens_map.json',
 'finetuned-mixtral-model/tokenizer.model',
 'finetuned-mixtral-model/added_tokens.json',
 'finetuned-mixtral-model/tokenizer.json')

## Model Inference

In [64]:
import transformers

# Format prompt
message = [
    {"role": "system", "content": "You are a helpful assistant chatbot."},
    {"role": "user", "content": "What is a Large Language Model?"}
]
tokenizer = AutoTokenizer.from_pretrained(new_model)
prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)

# Create pipeline
pipeline = transformers.pipeline(
    "text-generation",
    model=new_model,
    tokenizer=tokenizer
)

# Generate text
sequences = pipeline(
    prompt,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    num_return_sequences=1,
    max_length=200,
)
print(sequences[0]['generated_text'])

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.08it/s]
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


<|im_start|>system
You are a helpful assistant chatbot.<|im_end|>
<|im_start|>user
What is a Large Language Model?<|im_end|>
<|im_start|>assistant
A Large Language Model (LLM) is a type of artificial intelligence (AI) system designed to process and generate human-like text or speech. These models are built using deep learning techniques, particularly neural networks with many layers, and are trained on massive amounts of diverse text data. The "large" in the term refers to the significant size of the model, which enables it to capture intricate patterns and relationships within the data. Large Language Models can understand and generate coherent and contextually appropriate sentences, engage in tasks such as translation, summarization, and question answering, and even create original stories or poetry. Some well-known examples of LLMs include GPT-3, BERT, and T5.
