# DPO on News Memorization

This notebook runs Direct Preference Optimization (DPO) on the news memorization dataset in order to study memorization and forgetting.

In [1]:
import dataclasses
import torch
from transformers import BitsAndBytesConfig, TrainingArguments
from datasets import Dataset
import pandas as pd


from trl import DPOConfig, DPOTrainer
from peft import PeftModel

from matplotlib import pyplot as plt
from tqdm import tqdm

In [2]:
# Load API credentials from the JSON file one directory above this notebook
# and keep the Hugging Face entry in `hf_key`.
import json

with open('../apikeys.json', 'r') as f:
    # Read the whole file as text and parse it as a single JSON document.
    apikeys = json.loads(f.read())

hf_key = apikeys['hf_api_key']

In [3]:
# Select the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

## Setup Config for DPO Training

Before starting the actual training, we need to set up the configuration for DPO.

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig

In [5]:
# General DPO settings: model identifier plus input/output locations.

# Identifier of the model used for DPO.
dpo_model_name = "meta-llama/Meta-Llama-3-8B"

# Inputs: the news memorization CSV and the adapter directory
# (presumably produced by an earlier SFT run -- confirm against the SFT notebook).
dpo_data_dir = "../datasets/latest_news/latest_news_memorization.csv"
sft_adapter_dir = "./sft_models/latest_news_memorization"

# Outputs: where DPO results and logs are written.
dpo_output_dir = "./dpo_models/latest_news_memorization"
dpo_log_dir = "./dpo_logs/latest_news_memorization"

In [6]:
# bitsandbytes quantization settings: 4-bit loading with the "nf4"
# quantization type and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
# Training hyperparameters for the DPO run.
#
# `output_dir` is passed exactly once: repeating a keyword argument in a call
# is a SyntaxError ("keyword argument repeated"), which prevented this cell
# from running at all.
training_args = DPOConfig(
    # Where results and logs are written.
    output_dir=dpo_output_dir,
    logging_dir=dpo_log_dir,
    # Batch sizes and run length.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    # Optimization schedule.
    learning_rate=2e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,  # warmup ratio
    bf16=True,
    # Logging / evaluation / checkpoint cadence.
    logging_steps=10,
    eval_strategy="no",
    # NOTE(review): evaluation is disabled above, so eval_steps is presumably
    # unused; it is kept so that switching eval_strategy to "steps" works
    # without further edits.
    eval_steps=10,
    save_steps=100,
    save_total_limit=2,
)

# Length limits; both are set to the same value of 512.
# NOTE(review): presumably token counts for generation and tokenization --
# confirm where these names are used.
generate_max_length = tokenizer_max_length = 512