### VLLM Installation, Using Llama-3.2-1B as source model

In [24]:
from vllm import LLM, SamplingParams

model_name = "meta-llama/Llama-3.2-1B"
model = LLM(model = model_name)

INFO 01-07 23:18:26 config.py:510] This model supports multiple tasks: {'classify', 'generate', 'reward', 'score', 'embed'}. Defaulting to 'generate'.
INFO 01-07 23:18:26 config.py:1458] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 01-07 23:18:26 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='meta-llama/Llama-3.2-1B', speculative_config=None, tokenizer='meta-llama/Llama-3.2-1B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_mode


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.76it/s]



INFO 01-07 23:18:28 model_runner.py:1099] Loading model weights took 2.3029 GB
INFO 01-07 23:18:28 model_runner_base.py:120] Writing input of failed execution to /tmp/err_execute_model_input_20250107-231828.pkl...
INFO 01-07 23:18:28 model_runner_base.py:149] Completed writing input of failed execution to /tmp/err_execute_model_input_20250107-231828.pkl.


OutOfMemoryError: Error in model execution (input dumped to /tmp/err_execute_model_input_20250107-231828.pkl): CUDA out of memory. Tried to allocate 252.00 MiB. GPU 0 has a total capacity of 21.98 GiB of which 98.44 MiB is free. Including non-PyTorch memory, this process has 21.86 GiB memory in use. Of the allocated memory 21.42 GiB is allocated by PyTorch, with 24.00 MiB allocated in private pools (e.g., CUDA Graphs), and 71.30 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [25]:
import json
import random
import time
import re

In [26]:
def generate_completion(llm_engine, messages, max_length=1000, temperature=0.7):
    prompt = ""
    for message in messages:
        role = message["role"]
        content = message["content"]
        
        if role == "system":
            prompt += f"System: {content}\n"
        elif role == "user":
            prompt += f"User: {content}\n"
        elif role == "assistant":
            prompt += f"Assistant: {content}\n"
    
    prompt += "Assistant: "
    
    sampling_params = SamplingParams(
        temperature=temperature,
        max_tokens=max_length,
    )
    
    outputs = llm_engine.generate([prompt], sampling_params)
    generated_text = outputs[0].outputs[0].text
    
    completion = {
        "choices": [{
            "message": {
                "role": "assistant",
                "content": generated_text.strip()
            },
            "finish_reason": "stop"
        }],
    }
    
    return completion

### Generate Agent Responses

In [27]:
def construct_message(agents, question, idx):
    if len(agents) == 0:
        return {"role": "user", "content": "Can you double check that your answer is correct. Please reiterate your answer, with your final answer a single numerical number, in the form \\boxed{{answer}}."}

    prefix_string = "These are the solutions to the problem from other agents: "

    for agent in agents:
        agent_response = agent[idx]["content"]
        response = "\n\n One agent solution: ```{}```".format(agent_response)

        prefix_string = prefix_string + response

    prefix_string = prefix_string + """\n\n Using the solutions from other agents as additional information, can you provide your answer to the math problem? \n The original math problem is {}. Your final answer should be a single numerical number, in the form \\boxed{{answer}}, at the end of your response.""".format(question)
    return {"role": "user", "content": prefix_string}


def construct_assistant_message(completion):
    content = completion["choices"][0]["message"]["content"]
    return {"role": "assistant", "content": content}


def read_jsonl(path: str):
    with open(path) as fh:
        return [json.loads(line) for line in fh.readlines() if line]


def run_generation(agents = 1, rounds = 1, seed = 0):
    random.seed(seed)

    generated_description = {}

    questions = read_jsonl("/home/ubuntu/multiagent_debate/grade-school-math/grade_school_math/data/test.jsonl")
    random.shuffle(questions)

    for data in questions[:100]:
        question = data['question']
        answer = data['answer']

        agent_contexts = [[{"role": "user", "content": """Can you solve the following math problem? {} Explain your reasoning. Your final answer should be a single numerical number, in the form \\boxed{{answer}}, at the end of your response. """.format(question)}] for agent in range(agents)]

        for round in range(rounds):        
            for i, agent_context in enumerate(agent_contexts):
                if round != 0:
                    agent_contexts_other = agent_contexts[:i] + agent_contexts[i+1:]
                    message = construct_message(agent_contexts_other, question, 2*round - 1)
                    agent_context.append(message)

                completion = generate_completion(model, agent_context, temperature = 0.1)

                assistant_message = construct_assistant_message(completion)
                agent_context.append(assistant_message)

        generated_description[question] = (agent_contexts, answer)
        break

    json.dump(generated_description, open("gen_data/gsm_{}_{}.json".format(agents, rounds), "w"))

In [23]:
for agent_num in range(1, 8):
    for rounds in range(1, 5):
        start = time.time()
        run_generation(agent_num, rounds)
        
        print(f"Agents: {agent_num}, Rounds: {rounds}, Time Taken: {1000 * (time.time() - start)} ms")

Processed prompts: 100%|██████████| 2/2 [00:07<00:00,  3.57s/it, est. speed input: 169.26 toks/s, output: 280.00 toks/s]


Agents: 1, Rounds: 1, Time Taken: 7153.552055358887 ms


Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.15s/it, est. speed input: 11.89 toks/s, output: 139.87 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.23s/it, est. speed input: 155.57 toks/s, output: 138.28 toks/s]


Agents: 1, Rounds: 2, Time Taken: 14397.95708656311 ms


Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.16s/it, est. speed input: 11.88 toks/s, output: 139.78 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.26s/it, est. speed input: 154.96 toks/s, output: 137.74 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.42s/it, est. speed input: 291.76 toks/s, output: 134.76 toks/s]


Agents: 1, Rounds: 3, Time Taken: 21861.364364624023 ms


Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.18s/it, est. speed input: 11.84 toks/s, output: 139.32 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.25s/it, est. speed input: 155.04 toks/s, output: 138.05 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.42s/it, est. speed input: 291.31 toks/s, output: 134.80 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.54s/it, est. speed input: 424.42 toks/s, output: 132.67 toks/s]


Agents: 1, Rounds: 4, Time Taken: 29412.036418914795 ms


Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.20s/it, est. speed input: 11.81 toks/s, output: 138.99 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.31s/it, est. speed input: 11.63 toks/s, output: 136.84 toks/s]


Agents: 2, Rounds: 1, Time Taken: 14518.091917037964 ms


Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.30s/it, est. speed input: 11.65 toks/s, output: 137.05 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.39s/it, est. speed input: 11.51 toks/s, output: 135.39 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.54s/it, est. speed input: 291.95 toks/s, output: 132.64 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.57s/it, est. speed input: 290.78 toks/s, output: 132.05 toks/s]


Agents: 2, Rounds: 2, Time Taken: 29826.023817062378 ms


Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.40s/it, est. speed input: 11.49 toks/s, output: 135.23 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.33s/it, est. speed input: 11.60 toks/s, output: 136.48 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.57s/it, est. speed input: 291.13 toks/s, output: 132.21 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.63s/it, est. speed input: 288.56 toks/s, output: 131.04 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.93s/it, est. speed input: 545.13 toks/s, output: 126.22 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.78s/it, est. speed input: 555.07 toks/s, output: 128.52 toks/s]


Agents: 2, Rounds: 3, Time Taken: 45677.77991294861 ms


Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.33s/it, est. speed input: 11.59 toks/s, output: 136.37 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.49s/it, est. speed input: 57.00 toks/s, output: 138.15 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.36s/it, est. speed input: 190.99 toks/s, output: 135.84 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.37s/it, est. speed input: 190.87 toks/s, output: 135.66 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.59s/it, est. speed input: 464.35 toks/s, output: 131.84 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.63s/it, est. speed input: 461.92 toks/s, output: 131.12 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.98s/it, est. speed input: 706.69 toks/s, output: 125.34 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.92s/it, est. speed input: 712.34 toks/s, output: 126.32 toks/s]


Agents: 2, Rounds: 4, Time Taken: 54736.926317214966 ms


Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.91s/it, est. speed input: 14.39 toks/s, output: 138.65 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.28s/it, est. speed input: 11.68 toks/s, output: 137.42 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.21s/it, est. speed input: 11.80 toks/s, output: 138.82 toks/s]


Agents: 3, Rounds: 1, Time Taken: 20405.33757209778 ms


Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.23s/it, est. speed input: 11.76 toks/s, output: 138.33 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.20s/it, est. speed input: 11.80 toks/s, output: 138.85 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.21s/it, est. speed input: 11.79 toks/s, output: 138.72 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.65s/it, est. speed input: 419.50 toks/s, output: 130.81 toks/s]


### Evaluate Generated Answers

In [None]:
def parse_bullets(sentence):
    bullets_preprocess = sentence.split("\n")
    bullets = []

    for bullet in bullets_preprocess:
        try:
            idx = bullet.find(next(filter(str.isalpha, bullet)))
        except:
            continue

        bullet = bullet[idx:]

        if len(bullet) != 0:
            bullets.append(bullet)

    return bullets

def parse_yes_no(string):
    if "yes" in string.lower():
        return True
    elif "no" in string.lower():
        return False
    else:
        return None

def solve_math_problems(input_str):
    pattern = r"\d+\.?\d*"

    matches = re.findall(pattern, input_str)
    if matches:
        return matches[-1]

    return None

def parse_answer(input_str):
    pattern = r"\{([0-9.,$]*)\}"
    matches = re.findall(pattern, input_str)

    solution = None

    for match_str in matches[::-1]:
        solution = re.sub(r"[^0-9.]", "", match_str)
        if solution:
            break

    return solution

def compute_accuracy(gt, pred_solution):
    answers = solve_math_problems(gt)

    if answers is None:
        return None

    if type(pred_solution) == list:
        pred_answers = []

        for pred_solution in pred_solutions:
            pred_answer = parse_answer(pred_solution)

            if pred_answer is None:
                pred_answer = solve_math_problems(pred_solution)

            pred_answers.append(pred_answer)

        pred_answer = most_frequent(pred_answers)

    else:
        pred_answer = parse_answer(pred_solution)
        if pred_answer is None:
            pred_answer = solve_math_problems(pred_solution)

    if pred_answer is None:
        return 1

    if float(answers) == float(pred_answer):
        return 1
    else:
        return 0

def most_frequent(List):
    counter = 0
    num = List[0]

    for i in List:
        current_frequency = List.count(i)
        if current_frequency > counter:
            counter = current_frequency
            num = i

    return num

if __name__ == "__main__":
    response_dict = json.load(open("gsm_debate_3_3.json", "r"))

    questions = list(response_dict.keys())

    accuracies = []

    for question in questions:
        responses, gt = response_dict[question]

        pred_solutions = []
        for response in responses:
            pred_solution = response[-1]['content']

            pred_solutions.append(pred_solution)

        accurate = compute_accuracy(gt, pred_solutions)

        if accurate is not None:
            accuracies.append(float(accurate))
        else:
            import pdb
            pdb.set_trace()
            print(gt)

        print("accuracies:", np.mean(accuracies), np.std(accuracies) / (len(accuracies) ** 0.5))