# Speeding Up Inference with vLLM

In [None]:
from vllm import LLM, SamplingParams
# Settings forwarded to the vLLM engine constructor further down:
# max_model_len is passed as `max_model_len`, tp_size as `tensor_parallel_size`.
max_model_len, tp_size = 4096, 1
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import numpy as np

def get_response_prompts(instructions):
    """Wrap every instruction in a two-message chat (system + user).

    Args:
        instructions: Iterable of user instruction strings.

    Returns:
        A list with one chat per instruction, in input order. Each chat is a
        list of ``{"role": ..., "content": ...}`` dicts: the fixed system
        message followed by the instruction as the user message.
    """
    return [
        [
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": user_text},
        ]
        for user_text in instructions
    ]

def get_worse_response_question_prompts(instructions, positive_responses, worse_response_prompt_template):
    """Build chats that render the template for each (instruction, response) pair.

    Args:
        instructions: Iterable of original instruction strings.
        positive_responses: Iterable of responses, paired positionally with
            ``instructions`` (pairing stops at the shorter of the two).
        worse_response_prompt_template: ``str.format`` template that receives
            the keyword fields ``input`` and ``generation``.

    Returns:
        A list of chats, each a system message followed by a user message
        whose content is the formatted template.
    """
    return [
        [
            {"role": "system", "content": "You are a helpful AI assistant."},
            {
                "role": "user",
                "content": worse_response_prompt_template.format(
                    input=question, generation=answer
                ),
            },
        ]
        for question, answer in zip(instructions, positive_responses)
    ]

def extract_negative_questions(instructions, positive_responses, inputs):
    """Pull the modified question out of each generation, keeping lists aligned.

    Each item of ``inputs`` is expected to contain the text
    ``[The start of User Question]`` followed by one separator character,
    the question, and optionally ``[The end of User Question]``. Items
    without the start marker are reported on stdout and skipped.

    Args:
        instructions: Original instructions, indexed in parallel with ``inputs``.
        positive_responses: Responses, indexed in parallel with ``inputs``.
        inputs: Generated strings to parse.

    Returns:
        A tuple ``(kept_instructions, kept_positive_responses, questions)``
        of three equally long lists covering only the items that parsed.
    """
    start_marker = "[The start of User Question]"
    end_marker = "[The end of User Question]"

    kept_instructions = []
    kept_positive_responses = []
    questions = []
    for idx, item in enumerate(inputs):
        try:
            marker_end = item.index(start_marker) + len(start_marker)
            # Read both companions before appending anything, so a failure
            # can never leave the three output lists with different lengths.
            instruction = instructions[idx]
            positive = positive_responses[idx]
        except (ValueError, IndexError):
            # ValueError: start marker missing. IndexError: parallel list too short.
            print("Generation String Not Found! ID:", idx)
            continue

        # Skip the single separator character that follows the start marker.
        start = marker_end + 1
        # Only an end marker located after the start marker closes the question;
        # without one the question runs to the end of the string.
        end = item.find(end_marker, marker_end)
        if end == -1:
            end = len(item)

        questions.append(item[start:end])
        kept_instructions.append(instruction)
        kept_positive_responses.append(positive)
    return kept_instructions, kept_positive_responses, questions

def extract_negative_responses(instructions, positive_responses, generated_text):
    """Extract the modified-instruction response from each generation.

    Each generation is expected to contain
    ``[The start of Modified Instruction Response]`` followed by one separator
    character, the response, and optionally
    ``[The end of Modified Instruction Response]``. Generations without the
    start marker are reported on stdout and skipped.

    Args:
        instructions: Original instructions.
        positive_responses: Responses paired positionally with ``instructions``.
        generated_text: Generated strings paired positionally with the above.

    Returns:
        A list of dicts with keys ``"instruction"``, ``"positive"`` and
        ``"negative"``, one per generation that parsed.
    """
    start_marker = "[The start of Modified Instruction Response]"
    end_marker = "[The end of Modified Instruction Response]"

    all_responses = []
    triples = zip(instructions, positive_responses, generated_text)
    for idx, (instruction, positive, negative) in enumerate(triples):
        try:
            marker_end = negative.index(start_marker) + len(start_marker)
        except ValueError:
            print("Generation String Not Found! ID:", idx)
            continue

        # Skip the single separator character that follows the start marker.
        start = marker_end + 1
        # Only an end marker located after the start marker closes the response;
        # without one the response runs to the end of the string.
        end = negative.find(end_marker, marker_end)
        if end == -1:
            end = len(negative)

        all_responses.append(
            {"instruction": instruction, "positive": positive, "negative": negative[start:end]}
        )
    return all_responses

def create_judge_response_prompts(instructions, positives, negatives, judging_template):
    """Build pairwise judging chats with the positive response in a random slot.

    For every (instruction, positive, negative) triple one ``np.random.rand()``
    draw decides the order: below 0.5 the positive response fills the
    template's ``generation`` field (label 1), otherwise it fills
    ``generation2`` (label 0).

    Args:
        instructions: Iterable of instruction strings.
        positives: Preferred responses, paired positionally.
        negatives: Dispreferred responses, paired positionally.
        judging_template: ``str.format`` template with the keyword fields
            ``input``, ``generation`` and ``generation2``.

    Returns:
        ``(judge_prompts, labels)``: a list of chats (system + user message)
        and a parallel list of 1/0 labels.
    """
    judge_prompts = []
    labels = []
    for instruction, positive, negative in zip(instructions, positives, negatives):
        positive_first = np.random.rand() < 0.5
        if positive_first:
            first, second = positive, negative
        else:
            first, second = negative, positive
        user_content = judging_template.format(
            input=instruction, generation=first, generation2=second
        )
        judge_prompts.append(
            [
                {"role": "system", "content": "You are a helpful AI assistant."},
                {"role": "user", "content": user_content},
            ]
        )
        labels.append(1 if positive_first else 0)
    return judge_prompts, labels

def load_textfile_as_string(filepath):
    """Return the entire contents of the UTF-8 text file at ``filepath``."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents



In [None]:


# Seed torch's global RNG (NumPy's and Python's `random` RNGs are not seeded here).
torch.random.manual_seed(0)

# Checkpoint used for both the tokenizer and the vLLM engine.
model_name = "microsoft/Phi-3.5-mini-instruct"
# model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sampling settings shared by every llm.generate call in this notebook.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=2000)
# NOTE(review): trust_remote_code=True lets the checkpoint's own Python code run — confirm the source is trusted.
llm = LLM(model=model_name, tensor_parallel_size=tp_size, max_model_len=max_model_len, 
          trust_remote_code=True, enforce_eager=True)


In [None]:
from datasets import load_dataset
from tqdm import tqdm
import pickle

# `dataset` is the judge-training data; its "instruction" field is looked up
# below against WildChat conversation hashes (it is presumably a subset of WildChat).
dataset = load_dataset("facebook/Self-taught-evaluator-DPO-data")
WildChat = load_dataset("allenai/WildChat-1M")

In [None]:


# Build a lookup from WildChat conversation hash to message text.
# Alternative: load a previously pickled hash_id2content dictionary instead of rebuilding it.
# with open("hash_id2content.pkl", "rb") as f:
#   hash_id2content = pickle.load(f)
hash_id2content = dict()
for ex in tqdm(WildChat["train"], total=len(WildChat["train"])):
  turn = ex["turn"]
  # Index 2 * (turn - 1) presumably selects the final user message, assuming
  # messages alternate user/assistant — TODO confirm against the WildChat schema.
  # If a hash occurs more than once, the last occurrence wins.
  hash_id2content[ex["conversation_hash"]] = ex["conversation"][2 * (turn - 1)]["content"]


print("Starting 2.")
# Keep only the examples whose instruction hash resolves to a WildChat message,
# and substitute that message text for the hash inside the "src" prompt.
train_data = []
for ex in tqdm(dataset["train"], total=len(dataset["train"])):
    if ex["instruction"] in hash_id2content:
        ex["src"] = ex["src"].replace(ex["instruction"], hash_id2content[ex["instruction"]])
        train_data.append(ex)

print("Starting 3.")
# Extract Instructions
# Each instruction appears to be repeated 6 times in a row in the dataset,
# so only every 6th row is read.
skip = 6
all_instructions = []
num_responses = len(dataset['train'])//skip
for i in tqdm(range(0, num_responses*skip, skip), total=num_responses):
    try:
        instruction_example = hash_id2content[dataset['train'][i]['instruction']]
    except KeyError:
        # Hash has no WildChat entry. Catching only KeyError keeps Ctrl-C
        # (KeyboardInterrupt) able to stop this long loop.
        continue
    all_instructions.append(instruction_example)
print(all_instructions[0])

In [None]:
# Chat-format header strings that are stripped from the stored "src" prompts.
sys_str = "<|start_header_id|>system<|end_header_id|>\n\n"
eot_str = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
user_str = "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"

prompts = []
for sample in tqdm(dataset['train'], total=len(dataset['train'])):
    # Remove the system header; everything before the first user header is the system prompt.
    judgement_prompt = sample["src"].replace(sys_str, "")
    sys_idx = judgement_prompt.index(user_str)
    system_prompt = judgement_prompt[:sys_idx]
    # Remove the user headers, keep what follows the system prompt, then drop the assistant header.
    judgement_prompt = judgement_prompt.replace(user_str, "")[sys_idx:].replace(eot_str, "")

    # Skip samples whose instruction hash has no entry in the lookup.
    if sample["instruction"] not in hash_id2content:
        continue
    hash_id = sample["instruction"]
    instruction = hash_id2content[hash_id]
    judgement_prompt = judgement_prompt.replace(hash_id, instruction)

    # The chosen target's verdict gives the label: [[A]] -> 1, [[B]] -> 0, otherwise skip.
    answer = sample['tgt_chosen']
    if "[[A]]" in answer:
        label = 1
    elif "[[B]]" in answer:
        label = 0
    else:
        continue

    prompts.append({
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": judgement_prompt},
        ],
        "label": label,
    })

In [None]:
import pickle

# Save the prompt list (dicts with "messages" and "label") to a pickle file
# so the preprocessing above does not have to be repeated.
with open("prompt_pre_judge.pkl", "wb") as file:
    pickle.dump(prompts, file)

In [None]:
import pickle
from tqdm import tqdm
# Reload the prompts written by the previous cell.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this notebook.
with open("prompt_pre_judge.pkl", "rb") as file:
    loaded_list = pickle.load(file)

# Show the first entry.
print(loaded_list[0])

In [None]:
# Keep every 4th prompt, then print the reduced and the original sizes.
filtered_list = loaded_list[::4]
print(len(filtered_list), len(loaded_list))

In [None]:
# Tokenize each chat with the model's chat template, with the generation prompt appended.
prompt_token_ids = [tokenizer.apply_chat_template(messages['messages'], add_generation_prompt=True, tokenize=True) for messages in tqdm(filtered_list, total=len(filtered_list))]
# NOTE(review): the `prompt_token_ids=` keyword belongs to vLLM's older generate() signature —
# confirm the installed vLLM version still accepts it.
outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)

# Take the text of the first completion of every request.
generated_judgements = [output.outputs[0].text for output in outputs]
print(generated_judgements[0])

In [None]:
# Spot-check one prompt; raises IndexError if filtered_list has 9000 or fewer entries.
filtered_list[9000]

In [None]:
# Number of judgements and the judgement at the same index as the prompt inspected above.
len(generated_judgements), generated_judgements[9000]

In [None]:
# Score the generated judgements against the labels and collect the correct
# ones as three-message SFT conversations, split by which answer is right.
count_correct = 0
A_correct = []
B_correct = []
count_correct_format_incorrect_answer = 0
for judge_info, judgement in zip(filtered_list, generated_judgements):
    label = judge_info['label']
    picked_a = "[[A]]" in judgement
    picked_b = "[[B]]" in judgement
    if (picked_a and label == 1) or (picked_b and label == 0):
        count_correct += 1
        # Slicing copies the list, so the append below does not modify filtered_list.
        messages = judge_info['messages'][0:2]
        messages.append({"role": "assistant", "content": judgement})
        # Bucket by the label, not by whether "[[A]]" appears in the text:
        # a judgement containing both markers would otherwise put a
        # label-0 sample into A_correct.
        if label == 1:
            A_correct.append(messages)
        else:
            B_correct.append(messages)
    elif picked_a or picked_b:
        # A verdict marker is present but it does not match the label.
        count_correct_format_incorrect_answer += 1
print("Correct", count_correct, "/", len(generated_judgements))
print("A correct: ", len(A_correct))
print("B correct: ", len(B_correct))
print("Formatted Incorrect: ", count_correct_format_incorrect_answer, "/", len(generated_judgements))

In [None]:
# Balance the dataset: randomly downsample the larger class to the size of the smaller one.
import random
if(len(A_correct) > len(B_correct)):
    A_correct_sampled = random.sample(A_correct, len(B_correct))
    B_correct_sampled = B_correct
else:
    # Also taken when both classes have the same size; B is then a shuffled copy.
    B_correct_sampled = random.sample(B_correct, len(A_correct))
    A_correct_sampled = A_correct

# Both lengths are equal after balancing.
print(len(A_correct_sampled), len(B_correct_sampled))

#calculate token statistics for the generated responses
tokenizer = AutoTokenizer.from_pretrained(model_name)

def calculate_token_statistics(messages):
    """Print mean/median/max/min/std of the token length of each conversation.

    Args:
        messages: Iterable of chats (lists of role/content dicts). Each chat is
            tokenized with the module-level ``tokenizer`` chat template,
            without a generation prompt.

    Returns:
        None. Results are printed. For an empty input a notice is printed
        instead, because ``np.max`` / ``np.min`` raise on empty sequences.
    """
    token_counts = [
        len(tokenizer.apply_chat_template(message, add_generation_prompt=False, tokenize=True))
        for message in messages
    ]
    if not token_counts:
        print("No conversations to compute token statistics for.")
        return
    print("Mean Tokens: ", np.mean(token_counts))
    print("Median Tokens: ", np.median(token_counts))
    print("Max Tokens: ", np.max(token_counts))
    print("Min Tokens: ", np.min(token_counts))
    print("STD Tokens: ", np.std(token_counts))

calculate_token_statistics(A_correct_sampled+ B_correct_sampled)

# Save the dataset
import json

# Wrap every balanced conversation in a {"messages": [...]} record.
all_sft_samples = A_correct_sampled + B_correct_sampled
json_array = [{"messages": conversation} for conversation in all_sft_samples]

# Destination of the JSON export.
output_file_path = 'all_judgements_phi.json'

# Dump all records as one pretty-printed JSON array, keeping non-ASCII text as is.
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(json_array, f, ensure_ascii=False, indent=4)

In [None]:
# Load and print the two prompt templates used below: one is passed to
# get_worse_response_question_prompts, the other to create_judge_response_prompts.
worse_response_prompt_template = load_textfile_as_string('./prompts/worse_response_v2.prompt')
print(worse_response_prompt_template)
judging_prompt_template = load_textfile_as_string('./prompts/eval_plan.prompt')
print(judging_prompt_template)

In [None]:
# Generate one response per original instruction; these become the "positive" responses.
generated_responses_pos_prompts = get_response_prompts(all_instructions)
prompt_token_ids = [tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) for messages in generated_responses_pos_prompts]
outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)

# Take the text of the first completion of every request.
generated_responses_pos = [output.outputs[0].text for output in outputs]
print(generated_responses_pos[0])

In [None]:
# Render the worse-response template for every (instruction, positive response) pair
# and generate from it; the raw outputs are parsed by extract_negative_questions later.
all_instructions_pos_neg = get_worse_response_question_prompts(all_instructions, generated_responses_pos, worse_response_prompt_template)
prompt_token_ids = [tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) for messages in all_instructions_pos_neg]
outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)

# Take the text of the first completion of every request.
all_instructions_neg = [output.outputs[0].text for output in outputs]
print(all_instructions_neg[0])

In [None]:
# Show the first original instruction.
all_instructions[0]

In [None]:
# Show the response generated for the first original instruction.
generated_responses_pos[0]

In [None]:
# Show the raw worse-response-template generation for the first instruction.
all_instructions_neg[0]

In [None]:
# Parse the modified questions; the f_ lists contain only the entries that parsed
# and stay index-aligned with each other.
f_all_instructions, f_generated_responses_pos, f_all_instructions_neg = extract_negative_questions(all_instructions, generated_responses_pos, all_instructions_neg)

In [None]:
# Compare the size before filtering with the sizes of the three filtered lists.
len(all_instructions), len(f_all_instructions), len(f_generated_responses_pos), len(f_all_instructions_neg)

In [None]:
# Earlier approach, kept for reference (generated_responses_neg is not defined in this notebook):
# items_to_judge = extract_negative_responses(all_instructions, generated_responses_pos, generated_responses_neg)
# print(items_to_judge[0])
# print(len(items_to_judge))

# Answer each modified question; these answers serve as the "negative" responses
# when judged against the original instruction.
generated_responses_neg_prompts = get_response_prompts(f_all_instructions_neg)
prompt_token_ids = [tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) for messages in generated_responses_neg_prompts]
outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)

# Take the text of the first completion of every request.
f_generated_responses_neg = [output.outputs[0].text for output in outputs]
print(f_generated_responses_neg[0])

In [None]:
# Build the pairwise judging prompts (random response order; label 1 means the
# positive response is shown first) and generate one judgement per prompt.
all_judgements, labels = create_judge_response_prompts(f_all_instructions, f_generated_responses_pos, f_generated_responses_neg, judging_prompt_template)
prompt_token_ids = [tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) for messages in all_judgements]
outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)

# Take the text of the first completion of every request.
generated_judgements = [output.outputs[0].text for output in outputs]
print(generated_judgements[0])


In [None]:
# Print one full example: original instruction, positive response,
# modified instruction, and the response to the modified instruction.
idx = 0
print(f_all_instructions[idx])
print("_____"*10)
print(f_generated_responses_pos[idx])
print("_____"*10)
print(f_all_instructions_neg[idx])
print("_____"*10)
print(f_generated_responses_neg[idx])


In [None]:
# Score the judgements of the synthetic pairs against the labels and collect the
# correct ones as three-message SFT conversations, split by which answer is right.
count_correct = 0
A_correct = []
B_correct = []
count_correct_format_incorrect_answer = 0
for judge_prompt, judgement, label in zip(all_judgements, generated_judgements, labels):
    picked_a = "[[A]]" in judgement
    picked_b = "[[B]]" in judgement
    if (picked_a and label == 1) or (picked_b and label == 0):
        count_correct += 1
        sft_messages = [{"role": "system", "content": "You are a helpful AI assistant."},
                        {"role": "user", "content": judge_prompt[1]["content"]},
                        {"role": "assistant", "content": judgement}]
        # Bucket by the label, not by whether "[[A]]" appears in the text:
        # a judgement containing both markers would otherwise put a
        # label-0 sample into A_correct.
        if label == 1:
            A_correct.append(sft_messages)
        else:
            B_correct.append(sft_messages)
    elif picked_a or picked_b:
        # A verdict marker is present but it does not match the label.
        count_correct_format_incorrect_answer += 1
print("Correct", count_correct, "/", len(all_judgements))
print("A correct: ", len(A_correct))
print("B correct: ", len(B_correct))
print("Formatted Incorrect: ", count_correct_format_incorrect_answer, "/", len(all_judgements))

In [None]:
# Balance the dataset: randomly downsample the larger class to the size of the smaller one.
import random
if(len(A_correct) > len(B_correct)):
    A_correct_sampled = random.sample(A_correct, len(B_correct))
    B_correct_sampled = B_correct
else:
    # Also taken when both classes have the same size; B is then a shuffled copy.
    B_correct_sampled = random.sample(B_correct, len(A_correct))
    A_correct_sampled = A_correct

#calculate token statistics for the generated responses
tokenizer = AutoTokenizer.from_pretrained(model_name)

def calculate_token_statistics(messages):
    """Print mean/median/max/min/std of the token length of each conversation.

    Args:
        messages: Iterable of chats (lists of role/content dicts). Each chat is
            tokenized with the module-level ``tokenizer`` chat template,
            without a generation prompt.

    Returns:
        None. Results are printed. For an empty input a notice is printed
        instead, because ``np.max`` / ``np.min`` raise on empty sequences.
    """
    token_counts = [
        len(tokenizer.apply_chat_template(message, add_generation_prompt=False, tokenize=True))
        for message in messages
    ]
    if not token_counts:
        print("No conversations to compute token statistics for.")
        return
    print("Mean Tokens: ", np.mean(token_counts))
    print("Median Tokens: ", np.median(token_counts))
    print("Max Tokens: ", np.max(token_counts))
    print("Min Tokens: ", np.min(token_counts))
    print("STD Tokens: ", np.std(token_counts))

calculate_token_statistics(A_correct_sampled+ B_correct_sampled)

# Save the dataset
import json

# Wrap every balanced conversation in a {"messages": [...]} record.
all_sft_samples = A_correct_sampled + B_correct_sampled
json_array = [{"messages": conversation} for conversation in all_sft_samples]

# Destination of the JSON export.
# NOTE(review): the file name says "llama" although model_name is set to a Phi
# checkpoint earlier in this notebook — confirm the intended name.
output_file_path = 'all_judgements_llama.json'

# Dump all records as one pretty-printed JSON array, keeping non-ASCII text as is.
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(json_array, f, ensure_ascii=False, indent=4)

In [None]:
# Sizes of the two balanced classes (equal after balancing).
len(A_correct_sampled), len(B_correct_sampled)

In [None]:
#calculate token statistics for the generated responses
tokenizer = AutoTokenizer.from_pretrained(model_name)

def calculate_token_statistics(messages):
    """Print mean/median/max/min/std of the token length of each conversation.

    Args:
        messages: Iterable of chats (lists of role/content dicts). Each chat is
            tokenized with the module-level ``tokenizer`` chat template,
            without a generation prompt.

    Returns:
        None. Results are printed. For an empty input a notice is printed
        instead, because ``np.max`` / ``np.min`` raise on empty sequences.
    """
    token_counts = [
        len(tokenizer.apply_chat_template(message, add_generation_prompt=False, tokenize=True))
        for message in messages
    ]
    if not token_counts:
        print("No conversations to compute token statistics for.")
        return
    print("Mean Tokens: ", np.mean(token_counts))
    print("Median Tokens: ", np.median(token_counts))
    print("Max Tokens: ", np.max(token_counts))
    print("Min Tokens: ", np.min(token_counts))
    print("STD Tokens: ", np.std(token_counts))

calculate_token_statistics(A_correct_sampled+ B_correct_sampled)

In [None]:
# Save the dataset
import json

# Wrap every balanced conversation in a {"messages": [...]} record.
all_sft_samples = A_correct_sampled + B_correct_sampled
json_array = [{"messages": conversation} for conversation in all_sft_samples]

# Destination of the JSON export.
# NOTE(review): the file name says "llama" although model_name is set to a Phi
# checkpoint earlier in this notebook — confirm the intended name.
output_file_path = 'all_judgements_llama.json'

# Dump all records as one pretty-printed JSON array, keeping non-ASCII text as is.
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(json_array, f, ensure_ascii=False, indent=4)