# Assignment 4

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '2,3'

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset, Dataset
import pandas as pd
import random
from trl import DPOConfig, DPOTrainer
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

from tqdm import tqdm
import json
random.seed(3407)
torch.manual_seed(3407)



<torch._C.Generator at 0x7fc0e81c1ed0>

In [None]:
# Huggingface login token
from getpass import getpass
os.environ["HF_TOKEN"] = getpass("Enter your Hugging Face token: ")


In [2]:
# Load the model and set the device

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# Set up the special tokens
EOS_token = tokenizer.eos_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.unk_token

# Setting up lora
# Note: According to W_total = W_base + ΔW_lora, when lora applied, ΔW_lora ≈ 0, so W_total ≈ W_base, would not affect the model performance.

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,  # LoRA rank
    lora_alpha=32,  # LoRA scaling
    lora_dropout=0.1,  # Dropout probability
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.0470


## Load LIMA dataset and sampling instructions

In [2]:
# Load the dataset, randomly sample 50 instructions, and filter out multi-turn conversations
def is_single_turn(example):
    return len(example["conversations"]) == 2

lima_dataset = load_dataset("GAIR/LIMA", split="train", trust_remote_code=True).shuffle(seed=3407)
sample_size = 50 # Number of instructions to sample
instructions = []

with tqdm(total=sample_size, desc="Sampling") as pbar:
    for example in lima_dataset:
        if not is_single_turn(example):
            continue
        instructions.append(example["conversations"][0])
        pbar.update(1)
        if len(instructions) >= sample_size:
            break

Sampling: 100%|██████████| 50/50 [00:00<00:00, 7238.55it/s]


## Generate 5 responses for each insturction

In [None]:
def generate_responses(model, instruction, num_responses=1):
    messages = [
        {"role": "user", "content": instruction}
    ]
    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
    input_length = inputs.shape[1]  # Get the input length
    outputs = model.generate(
        inputs,
        max_new_tokens=256,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        num_return_sequences=num_responses,
        pad_token_id=tokenizer.unk_token_id,
    )
    # Remove the input tokens from the output
    responses = [tokenizer.decode(output[input_length:], skip_special_tokens=True).strip() for output in outputs]
    return responses

In [None]:
# Generate 5 responses for each of the sampled instructions
print("Generating responses...")

all_data = []
for instruction in instructions:
    responses = generate_responses(model, instruction, num_responses=5)
    all_data.append({"instruction": instruction, "responses": responses})
    print(f"Instruction: {instruction}")
    print(f"Responses: {responses}")


Generating responses...
Instruction: What are the major branches of philosophy?
Responses: ['Philosophy is a broad and multifaceted discipline that explores fundamental questions about reality, knowledge, values, and the nature of existence. Over the centuries, philosophers have developed various branches or subfields of philosophy to systematically investigate specific aspects of these fundamental questions. Here are some of the major branches of philosophy:\n\n1. Metaphysics: Metaphysics is the branch of philosophy that deals with the nature of reality, including the relationship between mind and body, substance and accident, and the ultimate nature of being.\n2. Epistemology: Epistemology is the branch of philosophy that deals with knowledge and belief. It explores questions about the nature and sources of knowledge, the limits of knowledge, and the relationship between truth and belief.\n3. Logic: Logic is the branch of philosophy that deals with the principles of correct reasoning

In [6]:
# Save the generated data to a JSON file
output_file = "lima_responses.json"
if os.path.exists(output_file):
    #delete the existing file
    os.remove(output_file)
with open(output_file, "w") as f:
    json.dump(all_data, f, indent=4)


## Use PairRM to create a preference dataset

In [6]:
# Load the generated data from the JSON file
output_file = "lima_responses.json"
with open(output_file, "r") as f:
    all_data = json.load(f)

In [None]:
# Use PairRM to create pairwise preference data
import llm_blender
import pandas as pd

# Load the LLM-Blender model for ranking
blender = llm_blender.Blender()
blender.loadranker("llm-blender/PairRM")

# Assuming all_data is a list of dictionaries with the structure:
# all_data = [{"instruction": ..., "responses": [...]}, ...]

# Extract inputs and candidates
inputs = [item["instruction"] for item in all_data]
candidates_texts = [item["responses"] for item in all_data]

# Rank each group of candidates, returning ranks[i][j] where ranks[i][j] indicates the rank of the j-th response for the i-th input (1 being the best)
ranks = blender.rank(inputs, candidates_texts, return_scores=False, batch_size=5)

# Construct Pairwise Preference Data
pairwise_data = []
for i, item in enumerate(all_data):
    instruction = item["instruction"]
    responses = item["responses"]
    response_ranks = ranks[i]

    # Sort indices in ascending order of rank, where rank 1 is the best
    sorted_indices = sorted(range(len(response_ranks)), key=lambda j: response_ranks[j])
    top_index = sorted_indices[0]  # Top-1 的索引

    # Pair the top one with the others
    # Create pairs of (chosen, rejected) for the top response and the others
    for j in sorted_indices[1:]:
        pairwise_data.append({
            "prompt": instruction,
            "chosen": responses[top_index],
            "rejected": responses[j]
        })

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(pd.DataFrame(pairwise_data))
# Save the dataset to a local JSONL file
jsonl_file_path = "pairrm_dataset.jsonl"
print(f"Saving dataset to local JSONL file: {jsonl_file_path}")
dataset.to_json(jsonl_file_path, orient="records", lines=True, force_ascii=False)
print(f"Dataset successfully saved to {jsonl_file_path}")



Successfully loaded ranker from  /home/keye/.cache/huggingface/hub/llm-blender/PairRM


Ranking candidates: 100%|██████████| 10/10 [00:24<00:00,  2.40s/it]


In [5]:
print(f"Pairwise data sample: {dataset[0]}") # Check the first sample of the dataset



In [None]:
# 将数据集推送至HuggingFace Hub

# push_to_hub 会以 Apache Arrow/Parquet 格式上传数据集，这是 Hugging Face Hub 的标准格式
dataset_repo_id = "YipKo/dsaa6000q_assignment4_pairrm-dataset"
print(f"Pushing dataset to Hugging Face Hub repository: {dataset_repo_id} (standard Arrow/Parquet format)")
dataset.push_to_hub(dataset_repo_id)
print("Standard dataset push completed.")

from huggingface_hub import HfApi

jsonl_file_path = "pairrm_dataset.jsonl"
api = HfApi()
target_path_in_repo = "data.jsonl"  # 在 Hub 仓库中的文件名

print(f"Uploading {jsonl_file_path} to {dataset_repo_id} as {target_path_in_repo}...")
api.upload_file(
    path_or_fileobj=jsonl_file_path,
    path_in_repo=target_path_in_repo,
    repo_id=dataset_repo_id,
    repo_type="dataset" # 指定仓库类型为 dataset
)
print(f"Successfully uploaded {jsonl_file_path} to {dataset_repo_id} as {target_path_in_repo}")


In [3]:
# Load the dataset from the JSONL file
dataset = load_dataset('json', data_files="pairrm_dataset.jsonl", split='train')


## Start DPO Fine-tuning

In [4]:
# DPO 微调
output_dir = "./DPO-Mistral-7B"

# 创建DPOTrainer
dpo_config = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2, # Although batch=1, due to Pairwise, it still forward twice, so to save memory, set gradient_accumulation_steps=2
    num_train_epochs=3, # 3 is enough for 50 instructions, actually from the second epoch, the loss is already very low and stable
    learning_rate=2e-5,
    beta=0.1,
    push_to_hub=False,
    logging_steps=25,
    fp16=True,
    logging_dir=os.path.join(output_dir, "logs"),
    max_prompt_length=256,
    max_length=512,
    save_strategy="epoch",
)
dpo_trainer = DPOTrainer(
    model=model,
    args=dpo_config,
    train_dataset=dataset,
    processing_class=tokenizer
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [5]:
dpo_trainer.train()
output_dir = "./DPO-Mistral-7B"

# 保存微调后的PEFT adapter模型
model.save_pretrained(output_dir)
# 保存tokenizer
tokenizer.save_pretrained(output_dir)


Step,Training Loss
25,0.6461
50,0.5389
75,0.3819
100,0.2685
125,0.0877
150,0.0927
175,0.0596
200,0.0392
225,0.0251
250,0.0098


('./DPO-Mistral-7B/tokenizer_config.json',
 './DPO-Mistral-7B/special_tokens_map.json',
 './DPO-Mistral-7B/tokenizer.model',
 './DPO-Mistral-7B/added_tokens.json',
 './DPO-Mistral-7B/tokenizer.json')

In [8]:
# Push and commit the trained PEFT adapter to HuggingFace
from huggingface_hub import HfApi
api = HfApi()
output_dir = "./DPO-Mistral-7B"
api.upload_folder(
    folder_path=output_dir,
    path_in_repo="/",
    repo_id="YipKo/dsaa6000q_assignment4_dpo_adapter",
    repo_type="model"
)

adapter_model.safetensors:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/27.4M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

Upload 23 LFS files:   0%|          | 0/23 [00:00<?, ?it/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/6.14k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/27.4M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/6.14k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/27.4M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/6.14k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/YipKo/dsaa6000q_assignment4_dpo_adapter/commit/b30508f8506f7d5e1a3595094f0fccea67b7b6d8', commit_message='Upload folder using huggingface_hub', commit_description='', oid='b30508f8506f7d5e1a3595094f0fccea67b7b6d8', pr_url=None, repo_url=RepoUrl('https://hf-mirror.com/YipKo/dsaa6000q_assignment4_dpo_adapter', endpoint='https://hf-mirror.com', repo_type='model', repo_id='YipKo/dsaa6000q_assignment4_dpo_adapter'), pr_revision=None, pr_num=None)

## Compare fine tuned model and original model's output

In [None]:
#  sample 10 instructions that were not seen in training
left_instructions = list(set([example["conversations"][0] for example in lima_dataset]) - set(instructions))
test_instructions = random.sample(left_instructions, 10)


In [None]:
# Load the original model and DPO model 
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", trust_remote_code=True)
original_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", torch_dtype=torch.bfloat16, device_map="auto")

dpo_model = PeftModel.from_pretrained(model=original_model, model_id="./DPO-Mistral-7B", is_trainable=False, torch_dtype="auto", device_map="auto")


In [None]:
# Define a function to compare the original and DPO models
def compare_models(instruction):
    orig_response = generate_responses(original_model, instruction, num_responses=1)[0]
    dpo_response = generate_responses(dpo_model, instruction, num_responses=1)[0]
    return orig_response, dpo_response

In [16]:
# Compare the completions from the original model (mistralai/Mistral-7B-Instruct-v0.2 and DPO fine-tuned model
results = []
with tqdm(test_instructions, desc="Comparing models") as pbar:
    for instr in pbar:
        orig, dpo = compare_models(instr)
        results.append({"Instruction": instr, "Original": orig, "DPO": dpo})

# Convert to pandas DataFrame and print out the dataframe to stdout
df = pd.DataFrame(results)
print(df)


Comparing models: 100%|██████████| 10/10 [04:14<00:00, 25.48s/it]

                                         Instruction  \
0  Given that Kohn-Sham DFT is strictly a ground-...   
1  The Batman dies. As a joke, (or possibly in ho...   
2  Given N jobs where every job is represented by...   
3  my dog Cannibal passed away last nigh, these a...   
4  How to add margin top to ```class="row"``` ele...   
5  I had an interview with an employer working on...   
6  I want to buy a used car in Santa Clara. Shoul...   
7  How to know if your girlfriend is ready to kis...   
8  I’m writing a short alternative history story ...   
9  Not sure if this is a Mozilla-specific JS synt...   

                                            Original  \
0  While Kohn-Sham Density Functional Theory (DFT...   
1  Ladies and Gentlemen, esteemed guests, fellow ...   
2  Here's the Python code to find the maximum pro...   
3  I'm so sorry for your loss. I'd be happy to he...   
4  To add a margin-top to `.row` elements in Twit...   
5  To approach a potential customer, specifical




## Link to dataset and Peft Adapter

PairRM Dataset: https://huggingface.co/datasets/YipKo/dsaa6000q_assignment4_pairrm-dataset

DPO Peft Adapter: https://huggingface.co/YipKo/dsaa6000q_assignment4_dpo_adapter
