In [None]:
# Uncomment to install the pinned dependencies into the running kernel's environment.
# %pip install -U -q transformers==4.51.3 datasets==3.5.0 bitsandbytes==0.45.5 triton==3.2.0 unsloth==2025.3.19 torch==2.6.0 peft==0.15.2 trl==0.15.2 wandb==0.19.10

In [None]:
import os
import wandb
import getpass

# Weights & Biases configuration, read by wandb / the trainer integration via
# environment variables.
#
# The API key assignment previously had no right-hand side (a SyntaxError).
# Never hardcode the key in the notebook: reuse one that is already exported
# in the environment, otherwise prompt for it without echoing the input.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass.getpass("Weights & Biases API key: ")

# Project that runs are logged under.
os.environ["WANDB_PROJECT"] = "Coursework"
# Ask the W&B integration to log saved checkpoints as model artifacts.
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub. No token is passed in code, so the
# credential is not stored in the notebook source; presumably this is what lets
# the later `push_to_hub=True` training run upload checkpoints.
login()

In [None]:
import unsloth
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig
from transformers import BitsAndBytesConfig

In [None]:
# 4-bit NF4 quantization settings with double quantization enabled.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the base checkpoint and its tokenizer with the quantization config above.
model, tokenizer = FastLanguageModel.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct", quantization_config=bnb_config
)

# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# LoRA hyperparameters: rank 16, alpha 16, no dropout, no bias terms,
# gradient checkpointing on, fixed seed.
lora_settings = dict(
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=42,
)

# Replace `model` with its LoRA-wrapped counterpart.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

In [None]:
# Both splits come from the same Hub dataset: the full `train` split is used
# for training and the first 32 rows of `test` for periodic evaluation.
dataset_id = "trl-lib/ultrafeedback_binarized"
split_train, split_eval = "train", "test[:32]"

train_dataset = load_dataset(dataset_id, split=split_train)
eval_samples = load_dataset(dataset_id, split=split_eval)

In [None]:
from trl import DPOConfig
from unsloth import is_bfloat16_supported

# Pick the mixed-precision mode from the hardware instead of hard-coding
# fp16=True. The model is loaded without an explicit dtype, so Unsloth selects
# it from the GPU; the trainer flag has to follow the same choice or training
# breaks on bf16-capable devices. On fp16-only GPUs this is identical to the
# previous fp16=True setting.
use_bf16 = is_bfloat16_supported()

dpo_args = DPOConfig(
    # Output / logging
    output_dir="Qwen2.5-0.5B-Instruct-DPO",
    logging_steps=40,
    report_to="wandb",
    # Sequence length and precision
    max_length=1024,
    fp16=not use_bf16,
    bf16=use_bf16,
    # Batching: effective train batch of 1 * 4 sequences per device
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # Schedule
    num_train_epochs=3,
    eval_strategy='steps',
    eval_steps=40,
    # Optimizer
    optim='adamw_8bit',
    learning_rate=5e-7,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    # DPO temperature
    beta=0.1,
    # Checkpointing and upload to the Hub
    save_strategy="steps",
    save_steps=1000,
    push_to_hub=True,
    hub_model_id="theevolutionisnear/Qwen2.5-0.5B-Instruct-DPO",
    hub_strategy="checkpoint",
    hub_token=True,
)

In [None]:
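# Optional: resume an earlier W&B run and download its logged model artifact.
# The run handle must be kept in `run`, because `use_artifact` is called on it below.
# run = wandb.init(project="Coursework",
#                  id="izl7baxm",
#                  resume="must")
# artifact = run.use_artifact('animavestra888-independent/Coursework/model-qgdrqpv5:v7', type='model')
# artifact_dir = artifact.download()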

In [None]:
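# Optional workaround (disabled): wrap torch.load so every call runs with
# weights_only=False, presumably needed to load the downloaded checkpoint.
# WARNING: weights_only=False performs full unpickling, which can execute
# arbitrary code; enable this only for checkpoints you trust.
# _torch_load = torch.load
#
# def _load_with_full_pickle(*args, **kwargs):
#     kwargs["weights_only"] = False
#     return _torch_load(*args, **kwargs)
#
# torch.load = _load_with_full_pickle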

In [None]:
from trl import DPOTrainer

# Arguments for the DPO trainer: the LoRA-wrapped model, no explicit reference
# model, the DPO config, the tokenizer as processing class, and the datasets.
trainer_kwargs = dict(
    model=model,
    ref_model=None,
    args=dpo_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_samples,
)
trainer = DPOTrainer(**trainer_kwargs)

# Fresh training run. To continue from a downloaded checkpoint instead, use:
# trainer.train(resume_from_checkpoint=artifact_dir)
trainer.train()