# DPO miscellaneous

### Finding max lengths in datasets

In [2]:
import json
from transformers import AutoTokenizer

def calculate_max_lengths(data_path, tokenizer):
    """Scan a JSONL preference dataset and report its longest token counts.

    Every non-blank line of ``data_path`` is parsed as a JSON object that must
    contain the string fields ``'prompt'``, ``'chosen'`` and ``'rejected'``.
    Each field is tokenized with
    ``tokenizer.encode(text, add_special_tokens=False)``, so the reported
    lengths do not include any special tokens the tokenizer would otherwise add.

    Args:
        data_path: Path to the JSONL file (read as UTF-8).
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            that returns a sized sequence of tokens.

    Returns:
        A tuple ``(max_prompt_length, max_target_length, max_combined_length)``:
        the longest prompt, the longest response (chosen or rejected), and the
        longest prompt + response pair. All three are 0 when the file holds
        no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the three required fields.
    """
    max_prompt_length = 0
    max_target_length = 0
    max_combined_length = 0

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(data_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            data = json.loads(line)
            prompt_length = len(tokenizer.encode(data['prompt'], add_special_tokens=False))
            chosen_length = len(tokenizer.encode(data['chosen'], add_special_tokens=False))
            rejected_length = len(tokenizer.encode(data['rejected'], add_special_tokens=False))

            # The longer response determines both the target and combined maxima.
            longest_response = max(chosen_length, rejected_length)

            max_prompt_length = max(max_prompt_length, prompt_length)
            max_target_length = max(max_target_length, longest_response)
            max_combined_length = max(max_combined_length, prompt_length + longest_response)

    return max_prompt_length, max_target_length, max_combined_length

In [4]:

# Locations of the DPO preference datasets (one JSON record per line)
train_data_path = 'datasets/DPO_train.jsonl'
vali_data_path = 'datasets/DPO_eval.jsonl'

# Tokenizer whose token counts are being measured
model_name = 'microsoft/Phi-3-mini-4k-instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Training split: (prompt, target, combined) maxima
train_lengths = calculate_max_lengths(train_data_path, tokenizer)
train_max_prompt_length, train_max_target_length, train_max_combined_length = train_lengths
print(f'Training Data - Max Prompt Length: {train_max_prompt_length}, Max Target Length: {train_max_target_length}, Max Combined Length: {train_max_combined_length}')

# Validation split: same three maxima
vali_lengths = calculate_max_lengths(vali_data_path, tokenizer)
vali_max_prompt_length, vali_max_target_length, vali_max_combined_length = vali_lengths
print(f'Validation Data - Max Prompt Length: {vali_max_prompt_length}, Max Target Length: {vali_max_target_length}, Max Combined Length: {vali_max_combined_length}')

# Overall maxima: element-wise max of the two per-split tuples
overall_max_prompt_length, overall_max_target_length, overall_max_combined_length = (
    max(per_split) for per_split in zip(train_lengths, vali_lengths)
)

print(f'Overall - Max Prompt Length: {overall_max_prompt_length}, Max Target Length: {overall_max_target_length}, Max Combined Length: {overall_max_combined_length}')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Training Data - Max Prompt Length: 528, Max Target Length: 1351, Max Combined Length: 1647
Validation Data - Max Prompt Length: 474, Max Target Length: 991, Max Combined Length: 1302
Overall - Max Prompt Length: 528, Max Target Length: 1351, Max Combined Length: 1647


Results:

Training Data - Max Prompt Length: 528, Max Target Length: 1351, Max Combined Length: 1647

Validation Data - Max Prompt Length: 474, Max Target Length: 991, Max Combined Length: 1302

Overall - Max Prompt Length: 528, Max Target Length: 1351, Max Combined Length: 1647

### GPT-neo: DPO

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Checkpoint identifier passed to both from_pretrained calls below.
# Note: this rebinds `model_name` (and `tokenizer` below), which the
# max-length cell earlier in this notebook also assigns.
model_name = "EleutherAI/gpt-neo-125m"

# Load the tokenizer and the causal-LM model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

## Archive

### Phi-3: DPO and LoRA

#### Setup

In [None]:
import json
import torch
import wandb
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, PeftType
from trl import DPOTrainer, DPOConfig
from sklearn.metrics import accuracy_score, roc_auc_score

: 

In [6]:
# Base checkpoint used for the DPO + LoRA experiment
model_name = 'microsoft/Phi-3-mini-4k-instruct'

# Single source of truth for the dataset locations (JSONL files)
train_data_path = 'datasets/DPO_train.jsonl'
vali_data_path = 'datasets/DPO_eval.jsonl'

# Aliases derived from the paths above instead of repeating the literals,
# so the two pairs of names can never point at different files.
train_path = train_data_path
eval_path = vali_data_path

# Load train and eval datasets into one object with "train" and "evaluation" splits
dataset = load_dataset('json', data_files={"train": train_path, "evaluation": eval_path})

In [7]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.23it/s]


#### Hyperparams

In [9]:
# Play with LORA configuration: start from hyperparams seen in literature, try different values later
lora_config = LoraConfig(
    r= 32, # attention dimension; default is 8, try higher for more precision
    lora_alpha=16, # decrease it if we see unstable
    target_modules='all-linear', # try with all-linear for more radical changes 
    lora_dropout=0.01, #same as original LORA paper
    bias="none",
    task_type="CAUSAL_LM",
)

lora_model = get_peft_model(model, lora_config)
wandb.init(project="dpo_lora")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33magatha-duzan[0m ([33magatha-duzan-EPFL[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# DPO training configuration, grouped by purpose.
# NOTE(review): the three length limits match the "Overall" figures printed by
# the max-length cell in this notebook, which tokenizes with
# add_special_tokens=False. If the trainer adds BOS/EOS tokens, the longest
# example may exceed these limits by a few tokens and be truncated - confirm.
dpo_config = DPOConfig(
    # --- DPO objective ---
    loss_type="sigmoid",
    beta=0.1,
    label_smoothing=0,
    precompute_ref_log_probs=True,
    disable_dropout=True,
    generate_during_eval=False,

    # --- sequence lengths / truncation ---
    max_prompt_length=528,
    max_target_length=1351, # calculated in dpo_debug
    max_length=1647,
    truncation_mode="keep_end",

    # --- optimisation schedule ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",

    # --- logging and checkpoints ---
    report_to='wandb',
    logging_dir='./logs',
    logging_steps=10,
    output_dir='checkpoints/dpo_lora',
    save_steps=500,
    save_total_limit=2,
)

# Build the trainer around the LoRA-wrapped model. No separate reference
# model, collator, or optimizer/scheduler pair is supplied (all passed as None).
trainer = DPOTrainer(
    model=lora_model,
    ref_model=None,
    args=dpo_config,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['evaluation'],
    data_collator=None,
    optimizers=(None, None)
)