# Speeding Up Inference with vLLM

In [1]:
import os

os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
from vllm import LLM, SamplingParams
max_model_len, tp_size = 4000, 1
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import numpy as np

def get_response_prompts(instructions):
    """Wrap every instruction in a two-message chat ready for generation.

    Args:
        instructions: Iterable of instructions; each one becomes the content
            of the user message.

    Returns:
        A list with one chat per instruction. Each chat is a list holding the
        fixed system message followed by the user message.
    """
    return [
        [
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": user_text},
        ]
        for user_text in instructions
    ]

def get_worse_response_question_prompts(instructions, positive_responses, worse_response_prompt_template):
    """Build the chats that ask the model to rewrite each response.

    Args:
        instructions: Iterable of instructions, paired element-wise with
            ``positive_responses`` (pairing stops at the shorter of the two).
        positive_responses: Iterable of responses to be rewritten.
        worse_response_prompt_template: Format string filled in with the
            keyword fields ``question`` and ``response``.

    Returns:
        A list with one chat per pair: the fixed system message followed by a
        user message containing the filled-in template.
    """
    return [
        [
            {"role": "system", "content": "You are a helpful AI assistant."},
            {
                "role": "user",
                "content": worse_response_prompt_template.format(
                    question=question_text, response=response_text
                ),
            },
        ]
        for question_text, response_text in zip(instructions, positive_responses)
    ]

def extract_negative_answer(instructions, positive_responses, inputs):
    """Pull the rewritten ("negative") answer out of each generated text.

    The answer is expected between the markers ``[Altered Response]`` and
    ``[End Altered Response]``. A generation that cannot be parsed is dropped
    together with its instruction and positive response, so the three
    returned lists always have the same length and stay index-aligned.

    A generation is dropped when:
      * it is not a string or has no ``[Altered Response]`` marker
        (reported on stdout),
      * it contains the start marker more than once (reported on stdout),
      * no ``[End Altered Response]`` follows the start marker (dropped
        silently),
      * the text between the markers is empty after trimming (reported on
        stdout).

    Args:
        instructions: Instructions, parallel to ``inputs``.
        positive_responses: Original responses, parallel to ``inputs``.
        inputs: Generated texts to parse.

    Returns:
        A tuple ``(instructions, positive_responses, negative_answers)`` of
        lists restricted to the generations that parsed. Each negative answer
        has trailing newlines/spaces removed and begins with a space.
    """
    start_marker = "[Altered Response]"
    end_marker = "[End Altered Response]"

    kept_instructions = []
    kept_positive_responses = []
    all_negative_answers = []
    # zip keeps the three sequences in lockstep, so an answer can never be
    # recorded without its matching instruction and positive response.
    for idx, (instruction, positive, item) in enumerate(zip(instructions, positive_responses, inputs)):
        if not isinstance(item, str):
            print("Generation String Not Found! ID:", idx)
            continue

        # count occurrences of [Altered Response]
        num_occurrences = item.count(start_marker)
        if num_occurrences > 1:
            print("Multiple Altered Responses Found! ID:", idx, "Count:", num_occurrences)
            continue
        if num_occurrences == 0:
            print("Generation String Not Found! ID:", idx)
            continue

        marker_end = item.index(start_marker) + len(start_marker)
        # Only an end marker located after the start marker can close the answer.
        end = item.find(end_marker, marker_end)
        if end == -1:
            continue

        # Skip the single separator character (space or newline) that follows
        # the start marker, then remove all newlines/spaces at the end.
        answer = item[marker_end + 1:end].rstrip("\n ")
        if not answer:
            print("Generation String Not Found! ID:", idx)
            continue
        # add a space at start of string if it does not exist
        if answer[0] != " ":
            answer = " " + answer

        all_negative_answers.append(answer)
        kept_instructions.append(instruction)
        kept_positive_responses.append(positive)
    return kept_instructions, kept_positive_responses, all_negative_answers

def extract_negative_responses(instructions, positive_responses, generated_text):
    """Collect (instruction, positive, negative) records from generations.

    The negative response is the text after the marker
    ``[The start of Modified Instruction Response]`` (plus one separator
    character) up to the next ``[The end of Modified Instruction Response]``.
    When no end marker follows the start marker, the text runs to the end of
    the generation.

    Generations that are not strings or lack the start marker are skipped and
    reported on stdout.

    Args:
        instructions: Instructions, paired element-wise with the other inputs.
        positive_responses: Original responses.
        generated_text: Generated texts to parse.

    Returns:
        A list of dicts with the keys ``"instruction"``, ``"positive"`` and
        ``"negative"``, one per generation that parsed.
    """
    start_marker = "[The start of Modified Instruction Response]"
    end_marker = "[The end of Modified Instruction Response]"

    all_responses = []
    for idx, (instruction, positive, negative) in enumerate(zip(instructions, positive_responses, generated_text)):
        marker_pos = negative.find(start_marker) if isinstance(negative, str) else -1
        if marker_pos == -1:
            print("Generation String Not Found! ID:", idx)
            continue

        marker_end = marker_pos + len(start_marker)
        # Search for the end marker only after the start marker; an earlier
        # stray end marker must not truncate the answer to an empty string.
        end = negative.find(end_marker, marker_end)
        if end == -1:
            end = len(negative)

        # marker_end + 1 skips the single separator character after the marker.
        all_responses.append({
            "instruction": instruction,
            "positive": positive,
            "negative": negative[marker_end + 1:end],
        })
    return all_responses

def create_judge_response_prompts(instructions, positives, negatives, judging_template):
    """Build pairwise judging chats with the positive answer in a random slot.

    For every (instruction, positive, negative) triple one value is drawn
    from ``np.random.rand()``. Below 0.5 the positive response fills the
    template's ``generation`` field and the negative one ``generation2``;
    otherwise the two are swapped.

    Args:
        instructions: Iterable of instructions (template field ``input``).
        positives: Iterable of positive responses.
        negatives: Iterable of negative responses.
        judging_template: Format string with the keyword fields ``input``,
            ``generation`` and ``generation2``.

    Returns:
        A tuple ``(judge_prompts, labels)``. ``judge_prompts`` holds one chat
        (system + user message) per triple; ``labels`` holds 1 where the
        positive response was placed in ``generation`` and 0 where it was
        placed in ``generation2``.
    """
    judge_prompts = []
    labels = []
    for instruction, positive, negative in zip(instructions, positives, negatives):
        # A single draw per example decides which slot the positive takes.
        positive_first = np.random.rand() < 0.5
        if positive_first:
            first, second = positive, negative
        else:
            first, second = negative, positive

        user_text = judging_template.format(input=instruction,
                                            generation=first,
                                            generation2=second)
        judge_prompts.append([
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": user_text},
        ])
        labels.append(1 if positive_first else 0)
    return judge_prompts, labels

def load_textfile_as_string(filepath):
    """Read the UTF-8 text file at ``filepath`` and return its whole content."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents



In [2]:
# Model id passed to both AutoTokenizer.from_pretrained and the vLLM engine.
model_name = "microsoft/Phi-3.5-mini-instruct"
# Alternative model (disabled):
# model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

In [3]:
messages = [
            {
                "role": "system",
                "content": "You are a helpful AI assistant."
            },
            {
                "role": "user",
                "content": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Begin your evaluation by first verifying whether each response contains any obvious or subtle errors. Then propose an appropriate evaluaiton rubric, e.g. 1-5 criteria that are important for evaluating responses to this specific user question. Continue your evaluation by checking each response carefully along those criteria. Based on the analysis in previous steps, choose which response is better overall. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your evaluation, output your final verdict by strictly following this format: \\\"[[A]]\\\" if assistant A is better, \\\"[[B]]\\\" if assistant B is better. \n\n[User Question]\nfast reading method when first few letters of words are bold\n\n[The Start of Assistant A's Answer]\nYou're referring to a technique called \"Rapid Serial Visual Presentation\" (RSVP) with bolded first few letters! This method can indeed help you read faster and more efficiently. Here's how it works:\n\n1. Identify the first few letters of each word that are bolded. This can be 2-3 letters, depending on your preference.\n2. Focus on the bolded letters to quickly identify the word. This helps your brain to recognize the word more quickly and accurately.\n3. Move your eyes to the next word, repeating the process of identifying the bolded letters and recognizing the word.\n4. 
Continue this process at a rapid pace, moving your eyes from one word to the next.\n\nThis technique can help you read faster in several ways:\n\n* Reduced eye movement: By focusing on the bolded letters, you reduce the amount of eye movement needed to read each word, allowing you to move more quickly through the text.\n* Increased recognition: The bolded letters help your brain to recognize words more quickly, as your brain is able to latch onto the familiar pattern of letters.\n* Improved processing: The bolded letters can also help your brain to process the words more efficiently, as the bolded letters provide a clear cue for your brain to recognize the word.\n\nTo get the most out of this technique, try the following:\n\n* Start with a moderate pace and gradually increase your speed as you become more comfortable with the technique.\n* Practice with different texts and materials to get a feel for how the technique works with different types of writing.\n* Experiment with different bolding patterns, such as bolding the first letter of each word or the first letter of every other word, to find what works best for you.\n\nRemember, the key to success with this technique is to practice regularly and develop a consistent reading rhythm. With time and practice, you can improve your reading speed and comprehension using this method!\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\nYou're interested in learning a fast reading method that utilizes visual cues to improve recognition and processing! I'd be happy to help you with that.\n\nThe method I'd like to introduce you to is called the \"Meta-Guiding\" technique. It's a visual cue-based method that can help you read faster and more efficiently.\n\n**What is Meta-Guiding?**\n\nMeta-Guiding is a technique that uses visual cues to guide your eyes as you read. 
The goal is to train your eyes to move more efficiently across the page, recognizing words and phrases more quickly, and processing the information more effectively.\n\n**How to Implement Meta-Guiding:**\n\n1. **Start by choosing a text**: Select a passage that you want to read quickly and efficiently. It can be a short article, a chapter, or even a book.\n2. **Identify the visual cues**: As you read, pay attention to the visual cues that catch your attention. These can be:\n\t* Headings and subheadings\n\t* Bold or italicized text\n\t* Images or graphics\n\t* Quotes or dialogue\n\t* Numbered or bulleted lists\n3. **Use these cues as guides**: As you read, use these visual cues to guide your eyes across the page. For example, if you see a heading, use it as a reference point to scan the surrounding text.\n4. **Practice active reading**: As you read, ask yourself questions like:\n\t* What is the main idea of this section?\n\t* What is the author trying to say?\n\t* What are the key points being made?\n5. **Use your peripheral vision**: Instead of focusing on individual words, use your peripheral vision to take in larger chunks of text. This will help you recognize patterns and relationships between words.\n6. **Practice, practice, practice!**: The more you practice using visual cues to guide your reading, the more efficient and effective you'll become.\n\n**Benefits of Meta-Guiding:**\n\n1. **Faster reading speeds**: By using visual cues, you can train your eyes to move more quickly across the page, allowing you to read faster.\n2. **Improved comprehension**: By focusing on the main ideas and key points, you'll be able to comprehend the text more effectively.\n3. **Reduced eye fatigue**: By using your peripheral vision and focusing on visual cues, you'll reduce eye fatigue and strain.\n4. **Increased retention**: By actively engaging with the text and using visual cues, you'll be able to retain more information.\n\n**Tips and Variations:**\n\n1. 
**Start with short texts**: Begin with shorter texts and gradually move to longer ones as you become more comfortable with the technique.\n2. **Experiment with different types of text**: Try using Meta-Guiding with different types of text, such as articles, blog posts, or even social media updates.\n3. **Combine with other reading techniques**: You can combine Meta-Guiding with other reading techniques, such as skimming or scanning, to further improve your reading efficiency.\n\nI hope this helps you get started with the Meta-Guiding technique! Remember to practice regularly and have fun exploring the benefits of this visual cue-based method.\n[The End of Assistant B's Answer]"
            }
        ]
# Load the tokenizer that belongs to model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE: with tokenize=False the chat template is rendered to text, so despite
# its name this variable holds the formatted prompt string, not token ids.
prompt_token_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(prompt_token_ids)

<|system|>
You are a helpful AI assistant.<|end|>
<|user|>
Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Begin your evaluation by first verifying whether each response contains any obvious or subtle errors. Then propose an appropriate evaluaiton rubric, e.g. 1-5 criteria that are important for evaluating responses to this specific user question. Continue your evaluation by checking each response carefully along those criteria. Based on the analysis in previous steps, choose which response is better overall. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After provi

In [4]:


# Seed torch's global RNG.
# NOTE(review): the engine log below reports its own `seed=0`; confirm whether
# this torch seed has any effect on vLLM sampling.
torch.random.manual_seed(0)


# Sampling settings reused by every llm.generate call in this notebook.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=2000)
# max_model_len and tp_size are set next to the imports (4000 and 1).
llm = LLM(model=model_name, tensor_parallel_size=tp_size, max_model_len=max_model_len, 
          trust_remote_code=True, enforce_eager=True, gpu_memory_utilization=0.90)


INFO 01-01 21:00:21 llm_engine.py:232] Initializing an LLM engine (v0.6.1) with config: model='microsoft/Phi-3.5-mini-instruct', speculative_config=None, tokenizer='microsoft/Phi-3.5-mini-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=microsoft/Phi-3.5-mini-instruct, use_v2_block_manager=False, num_scheduler_steps=1, enable_prefix_caching=False,

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 01-01 21:00:24 model_runner.py:997] Starting to load model microsoft/Phi-3.5-mini-instruct...
INFO 01-01 21:00:24 selector.py:240] Cannot use FlashAttention-2 backend due to sliding window.
INFO 01-01 21:00:24 selector.py:116] Using XFormers backend.
INFO 01-01 21:00:24 weight_utils.py:242] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 01-01 21:00:26 model_runner.py:1008] Loading model weights took 7.1659 GB
INFO 01-01 21:00:27 gpu_executor.py:122] # GPU blocks: 2325, # CPU blocks: 682


In [5]:
# NOTE(review): `question` is not referenced by any cell visible here.
question = "Give me a brief history of the Roman Empire."
# Other candidate prompts for o_question (disabled):
# o_question = "What are the benefits of stretching?"
# o_question = "Write me a python function that outputs the number of parameters in a pytorch model."
# o_question = "A train leaves Station A at 8:00 AM, traveling at 60 km/h. Another train departs Station B, 120 km away from Station A, at 9:00 AM, traveling toward Station A at 80 km/h. At what time do the two trains meet?"
# Multiple-choice reading-comprehension prompt; overwritten by the next
# assignment. The "# ..." lines inside the literal are part of the string.
o_question = """The island of Solara is home to a unique species of bird called the Solara Finch. These finches primarily feed on a specific type of berry that grows on tall bushes. During the dry season, the berry supply diminishes, forcing the finches to travel farther to find food. Researchers have observed that during the dry season, the finches’ wing muscles become stronger, allowing them to fly greater distances. This adaptation enables them to survive harsh conditions. However, researchers are concerned that prolonged dry seasons due to climate change may outpace the finches' ability to adapt.

# Question: Based on the passage, which of the following is the most likely outcome if the dry season becomes significantly longer over time?

# A) The Solara Finch population will increase due to improved flight capabilities.
# B) The Solara Finch population may decline because they might not adapt quickly enough.
# C) The finches will switch to a diet of insects to compensate for the loss of berries.
# D) The finches will migrate permanently to areas where berries are abundant year-round."""
# Open-ended discussion prompt; this is the value o_question holds after the cell runs.
o_question = """In recent years, the rapid advancement of artificial intelligence (AI) has sparked debates about its ethical implications and societal impact. Proponents argue that AI can solve complex problems, from predicting climate patterns to diagnosing diseases, potentially improving countless lives. Critics, however, warn of potential job displacement, biased algorithms, and privacy concerns. Some researchers advocate for stronger regulatory frameworks to ensure AI development aligns with human values, while others fear that excessive regulation may stifle innovation and delay technological progress.

# Question:
# Discuss the potential trade-offs involved in implementing stricter regulations on artificial intelligence development, considering both ethical and societal perspectives."""

In [6]:
# Single-example batch: one chat holding the system prompt and o_question.
generated_responses_pos_prompts = [[
            {
                "role": "system",
                "content": "You are a helpful AI assistant."
            },
            {
                "role": "user",
                "content": o_question
            }
        ]]
# Render each chat through the chat template directly to token ids (tokenize=True).
prompt_token_ids = [tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) for messages in generated_responses_pos_prompts]
outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)
# Keep the text of the first completion of every request.
generated_responses_pos = [output.outputs[0].text for output in outputs]
print(generated_responses_pos[0])

Processed prompts: 100%|██████████| 1/1 [00:10<00:00, 10.80s/it, est. speed input: 15.92 toks/s, output: 52.39 toks/s]

 Implementing stricter regulations on artificial intelligence (AI) development involves a complex set of trade-offs between ethical considerations and societal impacts. Here are some key points to consider:

1. Ethical concerns:
   - Ensuring fairness: Stricter regulations can help address issues related to bias in AI algorithms. By mandating transparency in data collection and processing methods, regulators can ensure that AI systems are not inadvertently perpetuating discrimination or reinforcing existing social inequalities.
   - Protecting privacy: AI technologies often rely on vast amounts of personal data to function effectively. Stricter regulations can help safeguard individuals' privacy by imposing limits on data collection, usage, and storage.
   - Accountability: Clear legal frameworks can hold AI developers and users responsible for any harm caused by their systems. This can help prevent misuse of AI technologies and ensure that developers take appropriate measures to minim




In [7]:
# Worse-response generation for the single example above.
# NOTE: the names say "judgement", but the template loaded here is
# mistake.prompt, which is filled with {question}/{response}; the output below
# contains a [Strategy] / [Altered Response] rewrite, not a verdict.
judgement_prompt = load_textfile_as_string("../prompts/mistake.prompt")
formatted_prompt = judgement_prompt.format(question=o_question, response=generated_responses_pos[0])

# Reuses (and overwrites) generated_responses_pos_prompts with the new chat.
generated_responses_pos_prompts = [[
            {
                "role": "system",
                "content": "You are a helpful AI assistant."
            },
            {
                "role": "user",
                "content": formatted_prompt
            }
        ]]
prompt_token_ids = [tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) for messages in generated_responses_pos_prompts]
outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)
# Keep the text of the first completion of every request.
outs = [output.outputs[0].text for output in outputs]
judgement = outs[0]
print(judgement)

Processed prompts: 100%|██████████| 1/1 [00:11<00:00, 11.24s/it, est. speed input: 80.27 toks/s, output: 51.61 toks/s]

 [Strategy]
Highlight potential negative aspects of stricter AI regulations while maintaining the original structure and length.

[Altered Response]
Implementing stricter regulations on artificial intelligence (AI) development involves a nuanced interplay of potential drawbacks alongside ethical and societal considerations. Here are some critical aspects to consider:

1. Ethical concerns:
   - Hindering innovation: While stricter regulations aim to ensure fairness and protect privacy, they could inadvertently slow down the pace of AI breakthroughs by imposing excessive constraints on research and development.
   - Over-regulation risks: An overly stringent regulatory environment might lead to a lack of diversity in AI applications, as developers may steer clear of innovative projects that could be deemed too risky or uncertain to meet regulatory standards.
   - Stifling creativity: In the quest to prevent bias and protect privacy, regulations could unintentionally limit the creative ex




In [8]:
from datasets import load_dataset
from tqdm import tqdm
import pickle

# dataset is a subset of WildChat; its "instruction" field is looked up below
# as a WildChat conversation hash.
dataset = load_dataset("facebook/Self-taught-evaluator-DPO-data")
WildChat = load_dataset("allenai/WildChat-1M")

# Build hash_id2content: conversation hash -> content of the message at index
# 2 * (turn - 1) of that conversation.
# A previously pickled copy of the mapping can be loaded instead:
# with open("hash_id2content.pkl", "rb") as f:
#   hash_id2content = pickle.load(f)
hash_id2content = dict()
for ex in tqdm(WildChat["train"], total=len(WildChat["train"])):
    turn = ex["turn"]
    hash_id2content[ex["conversation_hash"]] = ex["conversation"][2 * (turn - 1)]["content"]


print("Starting 2.")
# Keep the examples whose hash is known, replacing the hash inside "src" with
# the actual instruction text.
train_data = []
for ex in tqdm(dataset["train"], total=len(dataset["train"])):
    if ex["instruction"] not in hash_id2content:
        continue
    ex["src"] = ex["src"].replace(ex["instruction"], hash_id2content[ex["instruction"]])
    train_data.append(ex)

print("Starting 3.")
# Extract Instructions
skip = 6  # the author observed the same instruction repeated 6 times in a row; reason unknown
all_instructions = []
num_responses = len(dataset['train'])//skip
for i in tqdm(range(0, num_responses*skip, skip), total=num_responses):
    try:
        instruction_example = hash_id2content[dataset['train'][i]['instruction']]
    except KeyError:
        # No content is known for this hash; skip the example. Any other
        # error is a real problem and is allowed to surface.
        continue
    all_instructions.append(instruction_example)
print(all_instructions[0])

100%|██████████| 837989/837989 [06:52<00:00, 2033.62it/s]


Starting 2.


100%|██████████| 57468/57468 [00:02<00:00, 21700.81it/s]


Starting 3.


100%|██████████| 9578/9578 [00:00<00:00, 19318.08it/s]

How have Cloud and 5G helped make Extended Reality (XR) technology more accessible to a wide audience?


by making XR applications and devices increasingly reliant on tethered connections


by providing a more flexible and affordable way to render quality images at high speeds


by requiring users to upgrade their headsets with new graphic processing units


by using blockchain databases to track and manage user activity within XR environments





In [9]:
# Template asking the model to rewrite a response into a slightly worse one;
# it is filled through its {question} and {response} fields.
worse_response_prompt_template = load_textfile_as_string('../prompts/mistake.prompt')
print(worse_response_prompt_template)
# Template for the pairwise judge prompt.
# NOTE(review): judging_prompt_template is not used by any cell visible here.
judging_prompt_template = load_textfile_as_string('../prompts/eval_plan.prompt')
print(judging_prompt_template)

Given the [Question] and [Response] below, your goal is to rewrite the answer to make it slightly unpreferable compared to the original.
First, indicate how you will generate a slightly worse response than the original after the template string '[Strategy]'.
Second, provide an altered response after the template string '[Altered Response]'.
Important: make the altered response the approximately the SAME length or longer than the original [Response], but do not copy it.

[Question]
{question}

[Response]
{response}

------------------
Generate your answer with the following template (replacing <text>):

[Strategy]
<text>

[Altered Response]
<text>
[End Altered Response]


Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Begin your evaluation by first verifying whether each response conta

In [10]:
# Keep only the first 200 instructions for the generation cells that follow.
all_instructions = all_instructions[:200]

In [11]:
# Generate the "positive" response: the model's direct answer to every instruction.
generated_responses_pos_prompts = get_response_prompts(all_instructions)
print(len(generated_responses_pos_prompts))
# Render each chat through the chat template directly to token ids.
prompt_token_ids = [tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) for messages in generated_responses_pos_prompts]
outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)

# Keep the text of the first completion of every request.
generated_responses_pos = [output.outputs[0].text for output in outputs]
print(generated_responses_pos[0])

200


Processed prompts:   0%|          | 0/200 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:   0%|          | 1/200 [00:00<02:45,  1.20it/s, est. speed input: 7539.36 toks/s, output: 0.00 toks/s]



Processed prompts:   1%|          | 2/200 [00:01<03:01,  1.09it/s, est. speed input: 6709.92 toks/s, output: 0.00 toks/s]



Processed prompts:   2%|▏         | 3/200 [00:02<02:19,  1.41it/s, est. speed input: 7129.93 toks/s, output: 0.00 toks/s]



Processed prompts:  29%|██▉       | 58/200 [00:44<02:08,  1.11it/s, est. speed input: 679.32 toks/s, output: 730.24 toks/s]



Processed prompts:  40%|███▉      | 79/200 [01:01<01:03,  1.91it/s, est. speed input: 659.69 toks/s, output: 788.67 toks/s]



Processed prompts:  44%|████▎     | 87/200 [01:07<01:38,  1.15it/s, est. speed input: 753.37 toks/s, output: 783.09 toks/s]



Processed prompts:  56%|█████▌    | 112/200 [01:23<00:47,  1.84it/s, est. speed input: 858.02 toks/s, output: 809.29 toks/s]



Processed prompts: 100%|██████████| 200/200 [02:32<00:00,  1.31it/s, est. speed input: 662.41 toks/s, output: 856.07 toks/s]

 Cloud and 5G technologies have significantly contributed to making Extended Reality (XR) more accessible to a broad audience primarily by providing a more flexible and affordable way to render high-quality images at high speeds. The integration of cloud computing with XR technologies enables the processing and storage of large amounts of data needed for immersive experiences, which is difficult for individual devices to handle. Additionally, the high-speed connectivity offered by 5G networks ensures minimal latency and high bandwidth, which are essential for real-time interactions in XR applications. This combination allows for a seamless and scalable experience, enabling more users to access and enjoy XR content without the need for expensive hardware upgrades, such as new graphic processing units. Moreover, cloud and 5G infrastructure can support numerous users simultaneously, making XR experiences more interactive and collaborative.


Blockchain is not directly related to the incre




In [12]:
# Ask the model to rewrite every positive response using the worse-response template.
# Despite its name, all_instructions_pos_neg holds the chats (prompts) for that request.
all_instructions_pos_neg = get_worse_response_question_prompts(all_instructions, generated_responses_pos, worse_response_prompt_template)
prompt_token_ids = [tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) for messages in all_instructions_pos_neg]
outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)

# Despite its name, all_instructions_neg holds the raw generated texts
# ([Strategy] / [Altered Response] sections), not instructions.
all_instructions_neg = [output.outputs[0].text for output in outputs]
print(all_instructions_neg[0])

Processed prompts:   4%|▍         | 9/200 [00:18<03:47,  1.19s/it, est. speed input: 498.39 toks/s, output: 139.37 toks/s]



Processed prompts:   7%|▋         | 14/200 [00:27<04:48,  1.55s/it, est. speed input: 504.04 toks/s, output: 191.40 toks/s]



Processed prompts:  16%|█▋        | 33/200 [00:42<02:15,  1.23it/s, est. speed input: 946.00 toks/s, output: 393.66 toks/s]



Processed prompts:  18%|█▊        | 36/200 [00:43<01:26,  1.90it/s, est. speed input: 1052.55 toks/s, output: 413.67 toks/s]



Processed prompts:  22%|██▏       | 44/200 [00:49<02:19,  1.12it/s, est. speed input: 1218.00 toks/s, output: 446.33 toks/s]



Processed prompts:  30%|███       | 60/200 [01:06<02:57,  1.27s/it, est. speed input: 1222.93 toks/s, output: 452.38 toks/s]



Processed prompts:  32%|███▏      | 63/200 [01:07<01:41,  1.34it/s, est. speed input: 1290.87 toks/s, output: 463.77 toks/s]



Processed prompts:  42%|████▏     | 83/200 [01:26<01:41,  1.15it/s, est. speed input: 1345.80 toks/s, output: 485.59 toks/s]



Processed prompts:  44%|████▍     | 89/200 [01:32<01:45,  1.05it/s, est. speed input: 1371.17 toks/s, output: 493.05 toks/s]



Processed prompts:  50%|████▉     | 99/200 [01:38<01:00,  1.66it/s, est. speed input: 1498.19 toks/s, output: 509.59 toks/s]



Processed prompts:  52%|█████▏    | 103/200 [01:39<00:53,  1.80it/s, est. speed input: 1607.13 toks/s, output: 513.54 toks/s]



Processed prompts: 100%|██████████| 200/200 [03:09<00:00,  1.06it/s, est. speed input: 1401.35 toks/s, output: 537.28 toks/s]

 [Strategy]
By emphasizing network slicing's ability to prioritize XR traffic, highlighting how it maintains high Quality of Service (QoS) for critical XR functions.

[Altered Response]
Network slicing, a cornerstone of 5G, significantly enhances XR experiences by offering tailored network configurations for various XR applications. This ensures that essential XR functions receive priority, maintaining high Quality of Service (QoS) levels crucial for uninterrupted and responsive virtual interactions. For instance, a slice dedicated to real-time gaming in XR can guarantee low latency and high bandwidth, while another slice could prioritize high-resolution streaming content. In multi-user environments, this dedicated slicing ensures that all users experience smooth interactions without degradation in performance due to network congestion.

[End Altered Response]

----


### Follow-up Question 3

Considering the environmental impact of data centers, what sustainable practices could be imp




In [13]:
# Parse the altered responses; generations that cannot be parsed are dropped
# together with their instruction and positive response.
f_all_instructions, f_generated_responses_pos, f_generated_responses_neg = extract_negative_answer(all_instructions, generated_responses_pos, all_instructions_neg)
# filter based on length difference: keep a pair only when the positive and
# negative response differ by fewer than max_length_gap characters
max_length_gap = 1000
kept_triples = [(a, b, c) for a, b, c in zip(f_all_instructions, f_generated_responses_pos, f_generated_responses_neg) if abs(len(b)-len(c)) < max_length_gap]
if kept_triples:
    f_all_instructions, f_generated_responses_pos, f_generated_responses_neg = zip(*kept_triples)
else:
    # zip(*[]) yields nothing, so the three-way unpacking would raise
    # ValueError; fall back to empty tuples when no pair survives the filter.
    f_all_instructions, f_generated_responses_pos, f_generated_responses_neg = (), (), ()

Generation String Not Found! ID: 37
Generation String Not Found! ID: 52
Generation String Not Found! ID: 57
Generation String Not Found! ID: 61
Generation String Not Found! ID: 70
Generation String Not Found! ID: 80
Generation String Not Found! ID: 83
Generation String Not Found! ID: 84
Generation String Not Found! ID: 92
Generation String Not Found! ID: 95
Generation String Not Found! ID: 105
Generation String Not Found! ID: 111
Generation String Not Found! ID: 124
Generation String Not Found! ID: 126
Generation String Not Found! ID: 133
Generation String Not Found! ID: 148


In [14]:
# Manual inspection: print one filtered positive/negative pair.
idx = 7
# print(f_all_instructions[idx])
print(f_generated_responses_pos[idx])
# Visual separator between the positive and the negative response.
print("000000000")
print(f_generated_responses_neg[idx])


 Umbritin is a fictional cold gas giant planet that we are imagining, drawing inspiration from the characteristics of known exoplanets, such as TrES-2b. To describe Umbritin, let's assume it possesses similarities to TrES-2b, which is known for its extreme red coloration, indicating a unique atmospheric composition and temperature.

**Name**: Umbritin

**Type**: Cold Gas Giant

**Orbital Characteristics**:
- Distant from its host star, allowing for a colder temperature.
- Likely an exoplanet in a distant, possibly eccentric orbit.

**Size and Composition**:
- Approximately 1.7 times the radius of Jupiter, making it a massive planet.
- Composed mainly of hydrogen and helium, with possible heavier elements in the core.
- Possesses an extensive atmosphere with high cloud tops and a deep, possibly turbulent, lower atmosphere.

**Coloration**:
- Umbritin's surface resembles that of TrES-2b, with a deep red color.
- The coloration arises from the atmospheric constituents, specifically:
  1. 

In [15]:
# Mean and standard deviation of the response lengths, measured in
# whitespace-separated words.
pos_lengths = [len(response.split()) for response in f_generated_responses_pos]
neg_lengths = [len(response.split()) for response in f_generated_responses_neg]
print("Average Length of Positive Responses:", np.mean(pos_lengths), "STD: ", np.std(pos_lengths))
print("Average Length of Negative Responses:", np.mean(neg_lengths), "STD: ", np.std(neg_lengths))

Average Length of Positive Responses: 324.4296875 STD:  143.85309800679423
Average Length of Negative Responses: 283.625 STD:  149.04073478415222


In [16]:
# Per-pair word-count gap: positive length minus negative length
# (a negative value means the negative response is the longer one).
diff_lengths = [
    len(chosen.split()) - len(rejected.split())
    for chosen, rejected in zip(f_generated_responses_pos, f_generated_responses_neg)
]
print(len(diff_lengths), diff_lengths)

128 [23, -196, 37, 40, -39, 19, 126, 73, 126, -50, 57, 82, 159, -31, -31, -44, 99, 92, 31, 89, 115, 3, -52, 99, -1, -15, 47, -59, 32, 132, 36, 62, 2, -19, 14, 124, -107, 138, 62, 158, 18, 7, 91, -23, 55, 78, 134, 70, 70, 33, -18, 99, -20, -56, -12, 146, -3, 123, 28, -44, 76, 95, 144, 12, -41, -11, -86, 80, 95, 22, -79, -1, 7, 117, 55, 83, 31, -48, 17, 0, 45, 54, 35, 115, -27, 90, 135, -48, 55, 115, 94, 125, 136, 111, 81, -12, 21, 113, 156, 66, 91, 78, -33, 57, 15, 31, 1, -71, -41, -57, 144, 28, 54, 37, -38, 171, 57, 40, 62, 123, 26, 121, -7, 11, -52, 96, 25, 17]


In [17]:
# Number of negative responses; the trailing comma makes the cell's value a 1-tuple.
len(f_generated_responses_neg), 

(128,)

In [18]:
# Display the first negative response; as a bare expression the notebook shows its repr.
f_generated_responses_neg[0]

" The learning objective SPQ-2.B aims to shed light on the quantitative relationship between the elemental composition by mass and the composition of substances in mixtures. To grasp this concept, let's delve into its essential knowledge components:\n\n1. SPQ-2.B.1: The distinction between pure substances and mixtures\n\nPure substances are made up of molecules or formula units solely of one type of element or compound. For instance, elements like gold (Au) or compounds like water (H2O) fall under this category. Their composition remains consistent throughout the substance, as they consist of a single type of molecule.\n\nOn the other hand, mixtures consist of molecules or formula units of several different substances. These substances are physically combined but not chemically bonded. Mixtures can be either homogeneous (consistent composition throughout, such as saltwater) or heterogeneous (non-uniform composition, like a salad). The proportions of each substance in a mixture can chan

In [19]:
# Display the first positive response; as a bare expression the notebook shows its repr.
f_generated_responses_pos[0]

" The learning objective SPQ-2.B focuses on explaining the quantitative relationship between the elemental composition by mass and the composition of substances in a mixture. To understand this concept, let's break it down into its essential knowledge components:\n\n1. SPQ-2.B.1: The difference between pure substances and mixtures\n\nPure substances are composed of molecules or formula units of only one type of element or compound. Examples of pure substances include elements like gold (Au) or compounds like water (H2O). Their composition remains consistent throughout the entire substance, as they are made up entirely of a single type of molecule.\n\nIn contrast, mixtures are composed of molecules or formula units of two or more different types of substances. These substances are physically combined, but not chemically bonded. Mixtures can be either homogeneous (uniform composition throughout, like saltwater) or heterogeneous (non-uniform composition, like a salad). The proportions of 

In [20]:
# Build the pairwise judging conversations together with their labels.
all_judgements, labels = create_judge_response_prompts(
    f_all_instructions,
    f_generated_responses_pos,
    f_generated_responses_neg,
    judging_prompt_template,
)

# Render every conversation through the chat template directly to token ids,
# with the generation prompt appended so the model answers as the assistant.
prompt_token_ids = [
    tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=True)
    for conversation in all_judgements
]
outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)

# Keep the text of the first candidate returned for each request.
generated_judgements = [request_output.outputs[0].text for request_output in outputs]
print(generated_judgements[0])


Processed prompts:  35%|███▌      | 45/128 [01:02<01:33,  1.12s/it, est. speed input: 1189.23 toks/s, output: 359.96 toks/s]



Processed prompts: 100%|██████████| 128/128 [02:25<00:00,  1.13s/it, est. speed input: 1364.56 toks/s, output: 454.36 toks/s]

 Evaluation Rubric:

1. Clarity of Explanation: The explanation should be clear, concise, and easy to understand.
2. Accuracy of Information: The information provided must be correct and based on scientific principles.
3. Coverage of Essential Knowledge: The response should comprehensively cover all points of the essential knowledge.
4. Use of Examples: The response should use relevant examples to illustrate the concepts.
5. Structure and Organization: The response should be well-structured, logically organized, and easy to follow.

Evaluation:

Assistant A:

1. Clarity of Explanation: The explanation is somewhat clear but it introduces uncertainty by saying it aims to "introduce some uncertainty and a less precise understanding".
2. Accuracy of Information: The information is accurate and correctly explains the essential knowledge.
3. Coverage of Essential Knowledge: It covers all essential knowledge points, but the explanation could be more concise.
4. Use of Examples: No examples ar




In [21]:
# Print the instruction, then the positive and negative responses for one index,
# separated by divider lines (two dividers between the responses).
idx = 0
divider = "_____" * 10
print(f_all_instructions[idx])
print(divider)
print(f_generated_responses_pos[idx])
print(divider)
print(divider)
print(f_generated_responses_neg[idx])


Teach me this content?
LEARNING OBJECTIVE
SPQ-2.B
Explain the quantitative
relationship between the
elemental composition by
mass and the composition of
substances in a mixture.
ESSENTIAL KNOWLEDGE
SPQ-2.B.1
While pure substances contain molecules or
formula units of a single type, mixtures contain
molecules or formula units of two or more
types, whose relative proportions can vary.
SPQ-2.B.2
Elemental analysis can be used to determine
the relative numbers of atoms in a substance
and to determine its purity.

__________________________________________________
 The learning objective SPQ-2.B focuses on explaining the quantitative relationship between the elemental composition by mass and the composition of substances in a mixture. To understand this concept, let's break it down into its essential knowledge components:

1. SPQ-2.B.1: The difference between pure substances and mixtures

Pure substances are composed of molecules or formula units of only one type of element or compound. Exa

In [22]:
# Score the judge outputs against the labels and collect SFT samples.
#
# A label of 1 pairs with the "[[A]]" verdict and a label of 0 with "[[B]]".
# A judgement is accepted only when it contains exactly ONE of the two verdict
# markers and that verdict agrees with the label. Judgements that contain both
# markers are ambiguous: previously they were counted as correct for either
# label and, for label == 0, were filed under A_correct. They are now treated
# as "formatted but incorrect" and never enter the SFT buckets.
count_correct = 0
A_correct = []  # accepted samples whose verdict is A
B_correct = []  # accepted samples whose verdict is B
count_correct_format_incorrect_answer = 0
for judge_prompt, judgement, label in zip(all_judgements, generated_judgements, labels):
    says_a = "[[A]]" in judgement
    says_b = "[[B]]" in judgement
    # Exactly one marker -> unambiguous verdict (1 for A, 0 for B); otherwise None.
    verdict = (1 if says_a else 0) if says_a != says_b else None

    if verdict is not None and verdict == label:
        count_correct += 1
        # SFT sample: same system prompt, the judge prompt's user turn, and the
        # model's judgement as the assistant turn.
        sample = [
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": judge_prompt[1]["content"]},
            {"role": "assistant", "content": judgement},
        ]
        (A_correct if says_a else B_correct).append(sample)
        # Show the first few accepted samples for manual inspection.
        if count_correct < 10:
            print("_______")
            print(judge_prompt[1]["content"])
            print("_______")
            print(judgement)
    elif says_a or says_b:
        # A verdict marker is present, but it is wrong or ambiguous.
        count_correct_format_incorrect_answer += 1
print("Correct", count_correct, "/", len(all_judgements))
print("A correct: ", len(A_correct))
print("B correct: ", len(B_correct))
print("Formatted Incorrect: ", count_correct_format_incorrect_answer, "/", len(all_judgements))

_______
Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Begin your evaluation by first verifying whether each response contains any obvious or subtle errors. Then propose an appropriate evaluaiton rubric, e.g. 1-5 criteria that are important for evaluating responses to this specific user question. Continue your evaluation by checking each response carefully along those criteria. Based on the analysis in previous steps, choose which response is better overall. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your evaluation, output your final verdict by 

In [23]:
# Balance the dataset: down-sample so both buckets end up the same size.
import random

n_a, n_b = len(A_correct), len(B_correct)
if n_a <= n_b:
    # B is at least as large as A: draw len(A) items from B, keep A untouched.
    B_correct_sampled = random.sample(B_correct, n_a)
    A_correct_sampled = A_correct
else:
    # A is strictly larger: draw len(B) items from A, keep B untouched.
    A_correct_sampled = random.sample(A_correct, n_b)
    B_correct_sampled = B_correct

In [24]:
# Sizes of the two sampled buckets, displayed as a tuple.
len(A_correct_sampled), len(B_correct_sampled)

(49, 49)

In [25]:
# Calculate token statistics for the generated responses.
# NOTE(review): `tokenizer` is already referenced by an earlier cell, so this
# reload presumably just makes the cell runnable on its own; confirm that
# model_name still points at the same model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def calculate_token_statistics(messages):
    """Print token-count statistics for a collection of chat conversations.

    Each item of ``messages`` is passed to the module-level
    ``tokenizer.apply_chat_template`` (without a generation prompt) and the
    length of the resulting token sequence is recorded. The mean, median,
    max, min and standard deviation of those lengths are printed.

    Args:
        messages: Iterable of conversations accepted by
            ``tokenizer.apply_chat_template``.

    Returns:
        None. Results are only printed.
    """
    token_counts = [
        len(tokenizer.apply_chat_template(message, add_generation_prompt=False, tokenize=True))
        for message in messages
    ]
    # np.max / np.min raise ValueError on an empty sequence, and mean / median
    # would only yield nan with warnings, so report the empty case explicitly.
    if not token_counts:
        print("No messages to summarize; token statistics are undefined.")
        return
    print("Mean Tokens: ", np.mean(token_counts))
    print("Median Tokens: ", np.median(token_counts))
    print("Max Tokens: ", np.max(token_counts))
    print("Min Tokens: ", np.min(token_counts))
    print("STD Tokens: ", np.std(token_counts))

# Report token statistics over the combined A and B buckets.
calculate_token_statistics(A_correct_sampled + B_correct_sampled)

Mean Tokens:  1981.438775510204
Median Tokens:  2005.5
Max Tokens:  3708
Min Tokens:  743
STD Tokens:  573.8158359600236


In [26]:
# Save the dataset
import json

# One record per conversation, A-bucket samples first, then B-bucket samples.
all_sft_samples = A_correct_sampled + B_correct_sampled
json_array = [{"messages": conversation} for conversation in all_sft_samples]

# Destination file for the SFT records.
output_file_path = 'all_judgements_llama.json'

# Serialize as pretty-printed UTF-8 JSON, keeping non-ASCII characters unescaped.
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    json.dump(json_array, out_file, ensure_ascii=False, indent=4)