In [None]:
# Load the dataset.

import datasets # type: ignore

# Alternative datasets that come with their own train/test splits:
# tmds = datasets.load_dataset("ZSvedic/gpt4o-arena-brevity-dpo")
# dataset = tmds['train']
# eval_dataset = tmds['test']
# dataset = datasets.load_dataset('ZSvedic/phi3-arena-short-dpo', split='train')
# eval_dataset = datasets.load_dataset('ZSvedic/phi3-arena-short-dpo', split='test')

# Only the 'train' split is loaded for this dataset. Evaluating on the very
# same rows the model is trained on makes the eval loss meaningless, so hold
# out a fixed 10% slice instead (seeded so Restart & Run All is reproducible).
full_dataset = datasets.load_dataset('jondurbin/truthy-dpo-v0.1', split='train')
dataset_splits = full_dataset.train_test_split(test_size=0.1, seed=42)
dataset = dataset_splits['train']
eval_dataset = dataset_splits['test']
print(f"Train dataset: {len(dataset)}, Test dataset: {len(eval_dataset)}")

In [None]:
# Load the model and corresponding tokenizer.
import torch # type: ignore

import sys
# The path tweak has to run before the `utils` import below; `utils` is
# presumably a project-local package living in the parent directory.
sys.path.append('..')  # Add the parent directory to the Python path
import utils.llm_utils as llm

# Checkpoint to fine-tune; the commented-out names are alternatives that were tried.
# model_name = 'microsoft/Phi-3-mini-4k-instruct'
model_name = 'microsoft/phi-2'
# model_name = 'Manoj21k/microsoft-phi-2-finetuned'
# `model` is the policy that gets trained further down; how it is loaded
# (dtype, device, pad token, ...) is decided inside utils.llm_utils.
tokenizer, model = llm.load_tokenizer_and_model(model_name)
# From: https://github.com/jndiogo/LLM-chat-templates?tab=readme-ov-file#phi-2
# tokenizer.chat_template = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() + '\n\n' %}{% else %}{% set loop_messages = messages %}{% set system_message = '' %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{% set content = system_message + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ 'Instruct: ' + content.strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ 'Output: '  + content.strip() + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Output:' }}{% endif %}"

# Second copy of the same checkpoint, passed to DPOTrainer as `ref_model`
# further down in this notebook.
# NOTE(review): the original author was not sure an explicit ref_model is
# needed (everybody sets one) — confirm against the TRL docs before removing,
# since this copy costs extra GPU memory.
from transformers import AutoModelForCausalLM 

ref_model = AutoModelForCausalLM.from_pretrained(
        model_name, 
        torch_dtype=torch.float16, 
        device_map="auto", 
        trust_remote_code=True,
        # Required for packing: https://huggingface.co/blog/packing-with-FA2
        attn_implementation="flash_attention_2"
        )

# Report how much GPU memory torch has allocated so far, in MB.
print(f'Allocated GPU memory: {torch.cuda.memory_allocated() / (1024*1024):,.1f} MB')

In [None]:
# ZEL: Test the model verbosity before fine-tuning.

def calc_model_avg_len(tokenizer, model, questions=None, max_new_tokens=200):
    """Measure model verbosity as the mean character length of its answers.

    Each question is wrapped in the ``Instruct: ...\\nOutput: `` prompt format,
    all prompts are generated in one batch, and every prompt/answer pair is
    printed along with the final average.

    Args:
        tokenizer: Tokenizer used to encode the prompts and decode the answers.
        model: Model exposing ``generate`` and ``name_or_path``.
        questions: Optional list of question strings. Defaults to a built-in
            set of 20 short factual questions.
        max_new_tokens: Generation budget per answer (default 200).

    Returns:
        Average answer length in characters as a float; ``0.0`` when
        ``questions`` is empty.
    """
    if questions is None:
        questions = [
            "How much is 2+3?",
            "What is the color of the sky?",
            "What is the capital of France?",
            "What is the boiling point of water?",
            "Who wrote 'To Kill a Mockingbird'?",
            "What is the largest planet in our solar system?",
            "What is the chemical symbol for gold?",
            "How many continents are there?",
            "What is the speed of light?",
            "Who painted the Mona Lisa?",
            "What is the smallest prime number?",
            "What is the main ingredient in guacamole?",
            "What is the square root of 64?",
            "What is the currency of Japan?",
            "Who discovered penicillin?",
            "What is the tallest mountain in the world?",
            "What is the primary language spoken in Brazil?",
            "What is the freezing point of water?",
            "What is the largest mammal?",
            "What is the capital of Japan?"
        ]

    # Nothing to measure; avoid dividing by zero below.
    if not questions:
        return 0.0

    # Chat-template alternative that was tried and is kept for reference:
    # messages = [ [{"role": "user", "content": q}] for q in questions]
    # prompts = [tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
    #         for m in messages]
    prompts = [f"Instruct: {q}\nOutput: " for q in questions]

    # Batched generation with a decoder-only model needs LEFT padding so that
    # every prompt ends right where generation starts. Switch temporarily and
    # restore the caller's setting afterwards (training may rely on it).
    saved_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(prompts, return_tensors='pt', padding=True)
    finally:
        tokenizer.padding_side = saved_padding_side

    # Follow the model's own device instead of assuming 'cuda'; fall back to
    # the previous hard-coded value if the model does not expose `.device`.
    inputs = inputs.to(getattr(model, "device", "cuda"))
    prompt_tok_len = inputs["input_ids"].shape[1]

    results = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)
    # Drop the (padded) prompt columns so only newly generated tokens are decoded.
    answers = tokenizer.batch_decode(results[:, prompt_tok_len:], skip_special_tokens=True)

    answer_lengths = []
    for p, answer in zip(prompts, answers):
        print(f"PROMPT: {p}\nANSWER: {answer}")
        print('------------------------------------------------------------------------------------')
        answer_lengths.append(len(answer))

    total_average = sum(answer_lengths)/len(answer_lengths)
    print(f"Average {model.name_or_path} answer length for {len(answer_lengths)} questions: {total_average:.2f} characters")

    return total_average

# Then run the verbosity test:
# Generate in eval mode to record the pre-training baseline, then switch the
# model back to train mode for the cells that follow.
model.eval()
avg_len_before = calc_model_avg_len(tokenizer, model)
model.train()
print(avg_len_before)

In [4]:
# ZEL: define the len_metrics function.

def len_metrics(pred):
    """Metrics callback reporting the model's current average answer length.

    Args:
        pred: Evaluation predictions supplied by the trainer. Unused: the
            metric comes from a fresh generation run on the notebook-level
            `model` / `tokenizer`, not from the eval batch.

    Returns:
        dict with one entry, ``"avg_len"``: the numeric average answer length
        in characters as computed by ``calc_model_avg_len``.
    """
    # Remember the current mode so the model is left exactly as it was found,
    # even if generation raises.
    was_training = model.training
    model.eval()
    try:
        avg_len = calc_model_avg_len(tokenizer, model)
    finally:
        model.train(was_training)
    # Return the measured value (previously the literal string "avg_len").
    return {"avg_len": avg_len}


In [None]:
# The policy is trained; the reference model is only used for inference.
model.train()
ref_model.eval()

# Configure DPOTrainer.

from trl import DPOTrainer, DPOConfig

# Set up the training arguments
training_args = DPOConfig(
    output_dir="../results",
    logging_dir="../logs",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    # DPO temperature. Kept in the config next to the other DPO-specific
    # settings (max_length / max_prompt_length) rather than passed to the
    # trainer, where the keyword is deprecated.
    beta=0.1,
    max_prompt_length = 108, # 60 words x 6 ch x 1.2 buffer / 4 chars_in_token
    max_length = 324, # (60+120) words x 6 ch x 1.2 buffer / 4 chars_in_token
    remove_unused_columns=False,
    learning_rate=1e-6,
    # gradient_accumulation_steps=8,
    # gradient_checkpointing=True,
    # Log and evaluate after every single step so that the avg_len metric can
    # be tracked step by step. Each evaluation also runs the generation-based
    # len_metrics callback, so this is deliberately slow.
    logging_steps=1,
    eval_strategy="steps",
    eval_steps=1,
)

trainer = DPOTrainer(
    model=model,
    # NOTE(review): the TRL docs describe ref_model as optional — confirm
    # whether keeping this explicit copy is worth the GPU memory.
    ref_model=ref_model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    args=training_args,
    compute_metrics=len_metrics,
    )

# For debugging purposes, save the initial model before training.
trainer.save_model("../results/base-model")

In [None]:
# Run DPO fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# ZEL: Run the verbosity test after fine-tuning:
# Same measurement as the pre-training baseline, so the two averages can be
# compared directly. The model is left in eval mode afterwards.
model.eval()
avg_len_after = calc_model_avg_len(tokenizer, model)

print(f"AVG length before was {avg_len_before}, after fine-tuning it is {avg_len_after}.")

In [5]:
# Save.
# Writes the trainer's model to the output directory set in training_args.
trainer.save_model(training_args.output_dir)