In [1]:
import torch
import sys
sys.path.append('..')
from model.utils import LMHyperParams, SmModel, ModelChoice
from dataset.squad import UltraFeedbackDataModule
from transformers import AutoTokenizer, PreTrainedTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft.tuners.lora.config import LoraConfig
from transformers import TrainingArguments
from trl import DPOTrainer, DPOConfig
from typing import cast
from peft.peft_model import PeftModel
import gc
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hub checkpoint to fine-tune -- replace with your own model id.
model_id = "HuggingFaceTB/SmolLM2-135M-Instruct"

# Quantization settings: 4-bit NF4 weights with double quantization,
# using bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized causal LM with flash-attention-2 kernels and automatic
# device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(model_id)  # type: ignore

# Reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left to prevent errors with flash attention.
tokenizer.padding_side = "left"
# Truncate on the left to prevent cutting off the last generation.
tokenizer.truncation_side = "left"

In [3]:
# Build the UltraFeedback data module around the tokenizer and prepare the "fit" stage.
# NOTE(review): the constructor is called positionally and its signature is not
# visible in this notebook. 2 / 1024 / 10 / False presumably map to batch size,
# max length, a sample cap (the log shows 9 train + 1 eval examples) and a flag --
# confirm against dataset.squad before changing any of them.
data_module = UltraFeedbackDataModule(2, tokenizer, 1024, 10, False)
# debugger will fail without this
data_module.num_workers = 1
data_module.setup("fit")

[32m2024-11-24 01:46:28.875[0m | [1mINFO    [0m | [36mdataset.squad[0m:[36msetup[0m:[36m220[0m - [1mLoading dataset for stage fit[0m


[32m2024-11-24 01:46:30.488[0m | [1mINFO    [0m | [36mdataset.squad[0m:[36msetup[0m:[36m231[0m - [1mProcessing dataset for stage fit, workers: 1, cache dir dataset_caches/ultrafeedback[0m
Map: 100%|██████████| 9/9 [00:00<00:00, 488.55 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 129.03 examples/s]


In [10]:
# Free memory between experiments: run a Python garbage-collection pass first,
# then ask PyTorch to release the CUDA memory its allocator is caching.
# NOTE(review): this cell's execution count (10) is out of sequence with its
# neighbours, so it was run ad hoc; it is not required for a top-to-bottom run.
gc.collect()
torch.cuda.empty_cache()

In [5]:
# Token budgets for DPO: `max_prompt_length` caps the prompt alone, while
# `max_length` caps the prompt plus the chosen (or rejected) response.
prompt_length = 1024
max_seq_length = 1512

# LoRA adapter configuration: rank-256 adapters on every linear layer, no bias
# terms trained.
peft_config = LoraConfig(
    lora_alpha=128,
    lora_dropout=0.05,
    r=256,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)

args = DPOConfig(
    # NOTE(review): "doplhin" looks like a typo for "dolphin"; left unchanged so
    # that any existing checkpoints/logs under this directory are still found.
    output_dir="doplhin-dpo",
    num_train_epochs=1,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    learning_rate=5e-5,
    max_grad_norm=0.3,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=25,
    save_steps=500,
    save_total_limit=2,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="steps",
    eval_steps=700,
    bf16=True,
    tf32=True,
    push_to_hub=False,
    report_to="tensorboard",
    # debugger will fail without this
    dataloader_num_workers=1,
    dataset_num_proc=1,
    max_length=max_seq_length,
    max_prompt_length=prompt_length,
    precompute_ref_log_probs=True,
    beta=0.1,
    loss_type="sigmoid",
)


# `processing_class` replaces the deprecated `tokenizer=` keyword of DPOTrainer.
trainer = DPOTrainer(
    model,
    ref_model=None,  # set to none since we use peft
    peft_config=peft_config,
    args=args,
    train_dataset=data_module.train_dataset,
    eval_dataset=data_module.val_dataset,
    processing_class=tokenizer,  # type: ignore
)

Extracting prompt from train dataset: 100%|██████████| 9/9 [00:00<00:00, 315.41 examples/s]
Applying chat template to train dataset: 100%|██████████| 9/9 [00:00<00:00, 744.87 examples/s]
Extracting prompt from eval dataset: 100%|██████████| 1/1 [00:00<00:00, 126.57 examples/s]
Applying chat template to eval dataset: 100%|██████████| 1/1 [00:00<00:00, 110.80 examples/s]
Tokenizing train dataset: 100%|██████████| 9/9 [00:00<00:00, 163.50 examples/s]
Tokenizing eval dataset: 100%|██████████| 1/1 [00:00<00:00, 90.89 examples/s]


In [8]:
# Smoke test: compute the DPO loss on a single training batch before starting
# the full run.
loader = trainer.get_train_dataloader()
first_batch = next(iter(loader))
print(first_batch.keys())

# Make sure not to create a computation graph, or the model will OOM
with torch.no_grad():
    # Use `trainer.model` (the module the trainer itself holds after applying
    # `peft_config`) rather than the notebook-level `model` reference taken
    # before the trainer was built, and name `return_outputs` explicitly so the
    # (loss, metrics) tuple return is obvious.
    loss, metrics = trainer.compute_loss(trainer.model, first_batch, return_outputs=True)

print(loss)
display(metrics)

  prompt_input_ids = [torch.tensor(example["prompt_input_ids"]) for example in examples]
  chosen_input_ids = [torch.tensor(example["chosen_input_ids"]) for example in examples]
  rejected_input_ids = [torch.tensor(example["rejected_input_ids"]) for example in examples]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


dict_keys(['prompt_input_ids', 'prompt_attention_mask', 'chosen_input_ids', 'chosen_attention_mask', 'rejected_input_ids', 'rejected_attention_mask'])
tensor(0.6931, device='cuda:0')


{'rewards/chosen': tensor(0.),
 'rewards/rejected': tensor(0.),
 'rewards/accuracies': tensor(0.),
 'rewards/margins': tensor(0.),
 'logps/chosen': tensor(-868.0268),
 'logps/rejected': tensor(-542.8120),
 'logits/chosen': tensor(5.1562, dtype=torch.float16),
 'logits/rejected': tensor(4.8672, dtype=torch.float16)}

In [None]:
# Launch DPO fine-tuning with the trainer configured in this notebook.
trainer.train()