In [22]:
import os
import gc
import torch
import re

import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import DPOTrainer
import bitsandbytes as bnb
import wandb

# Defined in the secrets tab in Google Colab
wb_token = 'd62e774d3a369544825b73ac6c0e2a8dfcd11cb9'
wandb.login(key=wb_token)

model_name = "openchat/openchat-3.5-0106"
new_model = "Mandy"

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/tiny/.netrc


In [23]:
# same as format_as_chat(), but specially handles entries with multiple turns
def format_as_chat(example):
    prompt = example['prompt']
    messages = parse_conversation(prompt)

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # in the Nectar dataset, each example has an 'answers' list of at most 7 answers
    # each 'answer' has 'answer', 'model', and 'rank'
    # 'answer' is the text, 'model' is the model that generated that text, and 'rank' is how gpt4-turbo ranks the answer
    # since DPO simply wants a "chosen" and "rejected", we will pick the top ranked answer as "chosen",
    # and the 3rd ranking answer as "rejected". We skip the 2nd to improve the signal between "good" and "bad" (in theory).
    chosen = find_entry_with_rank(example['answers'], 1)

    if len(example['answers']) > 2:
        rejected = find_entry_with_rank(example['answers'], 3)
    elif len(example['answers'] > 1):
        rejected = find_entry_with_rank(example['answers'], 2)
    else:
        assert(False, 'Expected at least a rank 2 or 3 to exist')

    assert(chosen['rank'] == 1)
    assert(rejected['rank'] > 1)

    return {
        "prompt": prompt,
        "chosen": chosen['answer'] + eos_token_text,
        # question: should the rejected answer exclude the eos token, since we *do* was to highly value the eos token?
        "rejected": rejected['answer'] + eos_token_text,
    }

def parse_conversation(text):
    # Regular expression pattern to match '\n\nHuman:' or '\n\nAssistant:' followed by any text
    pattern = r"\n\n(Human|Assistant): (.*?)(?=\n\n(Human|Assistant): |\Z)"
    
    # Find all matches using the regular expression
    matches = re.findall(pattern, text, re.DOTALL)

    # Initialize an empty list to store the parsed data
    parsed_data = []

    # Iterate over the matches and create a dictionary for each
    for role, content, _ in matches:
        if role == 'Human':
            parsed_data.append({"role": "user", "content": content})
        elif role == 'Assistant':
            parsed_data.append({"role": "assistant", "content": content})

    assert(parsed_data[-1]["role"] == "assistant")
    assert(len(parsed_data[-1]["content"]) == 0)

    parsed_data.pop() # remove final, empty assistant response - it will be added back in the chat jinja turn formatting
    
    return parsed_data

def find_entry_with_rank(entries, rank):
    """Given a list of elements and a target rank, return the first element with that rank."""
    return next((elem for elem in entries if elem['rank'] == rank), None)

# Load dataset
dataset = load_dataset("berkeley-nest/Nectar")['train']

# Save columns
original_columns = dataset.column_names

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token_text = tokenizer.eos_token
print(f'eos token: {eos_token_text}')

# Format dataset
# weird: some entries don't have an answer with rank 1?
dataset = dataset.filter(lambda x: any([e for e in x['answers'] if e['rank'] == 1]))
dataset = dataset.map(
    format_as_chat,
    remove_columns=original_columns
)

# Print sample
dataset[0]

  assert(False, 'Expected at least a rank 2 or 3 to exist')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


eos token: <|end_of_turn|>


Map: 100%|██████████| 182953/182953 [00:31<00:00, 5872.33 examples/s]


{'prompt': '<s>GPT4 Correct User: 0.002 = 1000 \n1 = x?<|end_of_turn|>GPT4 Correct Assistant:',
 'chosen': 'To find the value of x, we can set up a proportion using the given information:\n\n0.002/1000 = 1/x\n\nTo solve for x, we can cross multiply:\n\n0.002 * x = 1000 * 1\n\n0.002x = 1000\n\nDividing both sides by 0.002:\n\nx = 1000 / 0.002\n\nx = 500,000\n\nTherefore, 1 is equal to 500,000 in this proportion.<|end_of_turn|>',
 'rejected': 'This looks like a proportion. To solve for x, you can set up a proportion equation:\n\n0.002 / 1 = 1000 / x\n\nNow, cross-multiply:\n\n0.002 \\* x = 1 \\* 1000\n\nThen, solve for x:\n\n0.002x = 1000\n\nx = 1000 / 0.002\n\nx = 500,000<|end_of_turn|>'}

In [24]:
# LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
)

# Model to fine-tune
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    load_in_4bit=True
)
model.config.use_cache = False

# Reference model
ref_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    load_in_4bit=True
)

# Training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=200,
    save_strategy="no",
    logging_steps=1,
    output_dir=new_model,
    optim="paged_adamw_32bit",
    warmup_steps=100,
    bf16=True,
    report_to="wandb",
)

# Create DPO trainer
dpo_trainer = DPOTrainer(
    model,
    ref_model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
    beta=0.1,
    max_prompt_length=1024,
    max_length=1536,
)

# Fine-tune model with DPO
dpo_trainer.train()

config.json: 100%|██████████| 651/651 [00:00<00:00, 7.52MB/s]
model.safetensors.index.json: 100%|██████████| 23.9k/23.9k [00:00<00:00, 66.9MB/s]
model-00001-of-00003.safetensors: 100%|██████████| 4.94G/4.94G [00:44<00:00, 110MB/s]
model-00002-of-00003.safetensors: 100%|██████████| 5.00G/5.00G [00:44<00:00, 112MB/s]
model-00003-of-00003.safetensors: 100%|██████████| 4.54G/4.54G [00:39<00:00, 115MB/s]
Downloading shards: 100%|██████████| 3/3 [02:09<00:00, 43.20s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.47s/it]
generation_config.json: 100%|██████████| 179/179 [00:00<00:00, 2.05MB/s]
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.29s/it]
Map: 100%|██████████| 182953/182953 [08:11<00:00, 372.05 examples/s] 
[34m[1mwandb[0m: Currently logged in as: [33mandysalerno[0m. Use [1m`wandb login --relogin`[0m to force relogin


Could not estimate the number of tokens of the input, floating-point operations will not be computed


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.34 GiB. GPU 0 has a total capacty of 23.69 GiB of which 1.04 GiB is free. Including non-PyTorch memory, this process has 22.60 GiB memory in use. Of the allocated memory 19.55 GiB is allocated by PyTorch, and 2.74 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Save artifacts
dpo_trainer.model.save_pretrained("final_checkpoint")
tokenizer.save_pretrained("final_checkpoint")

# Flush memory
del dpo_trainer, model, ref_model
gc.collect()
torch.cuda.empty_cache()

# Reload model in FP16 (instead of NF4)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    return_dict=True,
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Merge base model with the adapter
model = PeftModel.from_pretrained(base_model, "final_checkpoint")
model = model.merge_and_unload()

# Save model and tokenizer
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)