In [None]:
!pip install transformers
!pip install vllm

Collecting vllm
  Using cached vllm-0.5.0.post1-cp310-cp310-manylinux1_x86_64.whl (130.2 MB)
Collecting ninja (from vllm)
  Using cached ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
Collecting fastapi (from vllm)
  Using cached fastapi-0.111.0-py3-none-any.whl (91 kB)
Collecting openai (from vllm)
  Using cached openai-1.35.6-py3-none-any.whl (327 kB)
Collecting uvicorn[standard] (from vllm)
  Using cached uvicorn-0.30.1-py3-none-any.whl (62 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Using cached prometheus_fastapi_instrumentator-7.0.0-py3-none-any.whl (19 kB)
Collecting tiktoken>=0.6.0 (from vllm)
  Using cached tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
Collecting lm-format-enforcer==0.10.1 (from vllm)
  Using cached lm_format_enforcer-0.10.1-py3-none-any.whl (42 kB)
Collecting outlines>=0.0.43 (from vllm)
  Using cached outlines-0.0.46-py3-none-any.whl (101 kB)
Collecting ray>=2.9 (

In [None]:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import torch

# Hugging Face model id of the solver model. extract_last_code_block() also
# inspects this global to decide which code-block delimiters to search for.
model_path = "deepseek-ai/deepseek-math-7b-rl"
#model_path = "/kaggle/input/open-math-mistral"

# The tokenizer is used below to look up the EOS token id for the sampling parameters.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Build the vLLM engine:
#   - gpu_memory_utilization=0.99 lets the engine claim almost the whole GPU
#     (NOTE(review): leaves essentially no room for a second engine in the same
#     process - confirm this is intended when running the notebook top to bottom).
#   - max_model_len=2048 caps the context length handled by the engine.
#   - kv_cache_dtype="fp8_e5m2" stores the KV cache in fp8 to save memory; vLLM
#     warns that this may reduce accuracy without a proper scaling factor.
#   - tensor_parallel_size uses every CUDA device PyTorch can see.
#   - swap_space=3: presumably GiB of CPU swap per GPU - verify against the vLLM docs.
llm = LLM(model=model_path,
          dtype='auto',
          enforce_eager=True, # setting to False might help get more speed
          gpu_memory_utilization=0.99,
          swap_space=3,
          max_model_len=2048,
          kv_cache_dtype="fp8_e5m2",
          tensor_parallel_size=torch.cuda.device_count())
          #trust_remote_code=True,

ModuleNotFoundError: No module named 'vllm'

In [None]:
import re
import gc
import time
def extract_last_code_block(text):
    """Return the body of the last code block found in ``text``, or ``None``.

    The delimiters depend on the module-level ``model_path``: when its
    lower-cased value contains 'mistral', code is expected between
    ``<llm-code>`` and ``</llm-code>`` tags; otherwise inside a fenced
    ```` ```python ```` block. Only the text between the delimiters is returned.
    """
    uses_llm_code_tags = 'mistral' in model_path.lower()
    block_pattern = (
        r"<llm-code>(.*?)</llm-code>"
        if uses_llm_code_tags
        else r"```python\n(.*?)\n```"
    )
    # DOTALL lets a single block span several lines.
    found_blocks = re.findall(block_pattern, text, re.DOTALL)
    if not found_blocks:
        return None
    return found_blocks[-1]

def parse_final_answer(latex_output):
    r"""Extract the final numeric answer from a model response.

    The content of the last ``\boxed{...}`` in ``latex_output`` is converted to
    a non-negative integer and reduced modulo 1000. If the response contains no
    ``\boxed{...}`` at all, ``parse_final_answer_no_boxed`` is used instead.

    Args:
        latex_output: Text generated by the model.

    Returns:
        An int in ``range(1000)``, or ``None`` when no usable number is found
        (including when the boxed content is not a plain number).
    """
    boxed_contents = re.findall(r'\\boxed\{(.*?)\}', latex_output, re.DOTALL)
    if not boxed_contents:
        try:
            return parse_final_answer_no_boxed(latex_output)
        except ValueError:
            return None

    candidate = boxed_contents[-1].strip()
    try:
        # Parse integers exactly: going through float() first loses the low
        # digits of large values, which corrupts the result of `% 1000`.
        magnitude = abs(int(candidate))
    except ValueError:
        try:
            # Non-integer literals such as "12.7" or "1e3" are truncated toward zero.
            magnitude = int(abs(float(candidate)))
        except (ValueError, OverflowError):
            # Not a number at all, or inf/nan.
            return None
    return magnitude % 1000


def parse_final_answer_no_boxed(text, window=100):
    """Find a ``$<digits>$`` answer near the word "answer" at the end of ``text``.

    Only the last ``window`` characters of the lower-cased text are searched.
    A number counts when it appears on the same line as the word "answer",
    either before or after it. The last such match wins.

    Args:
        text: Text generated by the model.
        window: How many trailing characters to search (default 100). Values
            of 0 or less search nothing.

    Returns:
        The matched number modulo 1000, or ``None`` when nothing matches.
    """
    # "$N$ ... answer" or "answer ... $N$"; '.' does not cross line breaks here.
    pattern = r'(\$\d+\$).*?answer|answer.*?(\$\d+\$)'

    lowered = text.lower()
    # Clamp the start index so that window <= 0 yields an empty tail instead of
    # the whole text (which a plain negative slice would give for window == 0).
    tail = lowered[max(len(lowered) - window, 0):]

    matches = re.findall(pattern, tail)
    if not matches:
        return None

    # Exactly one of the two groups is non-empty for any given match.
    number_before, number_after = matches[-1]
    digits = (number_before or number_after).strip('$')
    return abs(int(digits)) % 1000

def manage_resources(rounds=5, pause=0.2):
    """Best-effort release of Python garbage and cached CUDA memory.

    Each round runs the garbage collector first, so that tensors only kept
    alive by reference cycles are freed, and then asks PyTorch to empty its
    CUDA cache.

    Args:
        rounds: Number of collect/empty-cache rounds (default 5).
        pause: Seconds to sleep after each round (default 0.2); values of 0 or
            less skip the sleep.
    """
    for _ in range(rounds):
        gc.collect()
        torch.cuda.empty_cache()
        if pause > 0:
            time.sleep(pause)

In [None]:
import io
import contextlib
import subprocess
import sys
import os

def run_code_once_deepseek(code, timeout=5):
    """Run a Python snippet in a fresh interpreter and return what it printed.

    SECURITY NOTE: ``code`` is executed as-is with the privileges of the
    notebook process. It is model-generated text, so only run this inside a
    disposable/sandboxed environment.

    Args:
        code: Python source to execute, or ``None`` when no code block could
            be extracted from the model output.
        timeout: Maximum run time in seconds (default 5).

    Returns:
        The snippet's stdout followed by its stderr; a fixed message when
        ``code`` is ``None`` or the run times out; or
        ``"An error occurred: ..."`` if the snippet could not be launched.
    """
    import tempfile  # local import keeps this cell self-contained

    if code is None:
        return "Error parsing code, try again"

    try:
        # A private temporary directory avoids clobbering a fixed
        # 'temp_code.py' in the working directory and is always cleaned up.
        with tempfile.TemporaryDirectory() as workdir:
            script_path = os.path.join(workdir, 'temp_code.py')
            with open(script_path, 'w', encoding='utf-8') as fout:
                fout.write(code)

            # Argument list instead of a shell string: no quoting problems with
            # the interpreter path and no dependency on the external `timeout`
            # binary. subprocess.run() kills the child itself on timeout.
            result = subprocess.run([sys.executable, script_path],
                                    stdin=subprocess.DEVNULL,  # input() fails fast instead of blocking
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    text=True,
                                    errors='replace',  # never fail on undecodable output bytes
                                    timeout=timeout)

            # Combine stdout and stderr
            output = result.stdout + result.stderr

    except subprocess.TimeoutExpired:
        output = f"Execution timed out after {timeout} seconds."

    except Exception as e:
        # Deliberately broad: a broken snippet must never abort the evaluation run.
        output = f"An error occurred: {str(e)}"

    return output

# Example usage
# Smoke tests for run_code_once_deepseek(): each snippet is executed in a
# subprocess and its combined stdout/stderr is printed.

# Prints a line and then fails, so the traceback follows the printed text.
code_1 = """
print("Hello, world!")
x = 1 / 0  # This will raise a ZeroDivisionError
"""
# Successful run with a single line of output.
code_2 = """
import math
print(math.sqrt(4))
"""
# Successful run with several lines of output.
code_3 = """
for i in range(10):
    print(i)
"""

# Sleeps for 10 s, which is longer than the 5 s limit of run_code_once_deepseek().
code_4 = """
# timeout error
import time
time.sleep(10)
"""

print("Test 1:\n", run_code_once_deepseek(code_1))
print("Test 2:\n", run_code_once_deepseek(code_2))
print("Test 3:\n", run_code_once_deepseek(code_3))
# Test 4 is left disabled; enabling it makes this cell wait for the timeout.
#print("Test 4:\n", run_code_once_deepseek(code_4))

Test 1:
 Hello, world!
Traceback (most recent call last):
  File "/content/temp_code.py", line 3, in <module>
    x = 1 / 0  # This will raise a ZeroDivisionError
ZeroDivisionError: division by zero

Test 2:
 2.0

Test 3:
 0
1
2
3
4
5
6
7
8
9



In [None]:
# Both parameter sets stop at the tokenizer's EOS token or as soon as the model
# opens an '```output' fence; the fence itself is left out of the returned text.

# Seed pass: hotter sampling with a short 200-token budget.
sampling_params_deepseek_seed = SamplingParams(
    temperature=0.85,
    top_p=0.95,
    max_tokens=200,
    stop_token_ids=[tokenizer.eos_token_id],
    stop='```output',
    include_stop_str_in_output=False,
)

# Solve passes: cooler sampling with a longer budget.
# NOTE(review): 1028 looks like it may have been meant as 1024 - confirm.
sampling_params_deepseek_solve = SamplingParams(
    temperature=0.3,
    top_p=0.7,
    max_tokens=1028,
    stop_token_ids=[tokenizer.eos_token_id],
    stop='```output',
    include_stop_str_in_output=False,
)

In [None]:
# dataset init
# Load the AIME 1983-2024 problem set from the Hugging Face Hub. The cells
# below read the 'Question' and 'Answer' columns of this frame.
# NOTE(review): the hf:// URL scheme presumably requires `huggingface_hub`
# (fsspec integration) to be installed - confirm for the target runtime.
import pandas as pd

df = pd.read_csv("hf://datasets/qq8933/AIME_1983_2024/AIME_Dataset_1983_2024.csv")

In [None]:
# Fix the seeds: Python's global RNG is seeded here, while the pandas draws
# below are pinned independently through random_state.
import random
random.seed(69)

# Draw a 50-problem and a 10-problem evaluation subset with the same seed.
df_sample_50 = df.sample(50, random_state=69)
df_sample_10 = df.sample(10, random_state=69)

# Store the reference answers as integers in both subsets.
for _sample in (df_sample_50, df_sample_10):
    _sample['Answer'] = _sample['Answer'].astype(int)

In [None]:
# Preview the first rows of the 10-problem sample.
df_sample_10.head()

Unnamed: 0,ID,Year,Problem Number,Question,Answer,Part
833,2021-I-4,2021,4,Find the number of ways $66$ identical coins c...,331,I
745,2018-I-6,2018,6,Let $N$ be the number of complex numbers $z$ w...,440,I
417,2006-II-10,2006,10,Seven teams play a soccer tournament in which ...,831,II
205,1998-12,1998,12,"Let $ABC$ be equilateral , and $D, E,$ and $F$...",83,
822,2020-II-8,2020,8,Define a sequence recursively by $f_1(x)=|x-1|...,101,II


In [None]:
from collections import Counter

def _deepseek_collect_round(request_outputs, answers, ground_truth):
    """Process one batch of solve-phase generations.

    Completions cut off by the token limit are dropped. Completions that
    stopped at an '```output' fence have their last code block executed and
    become follow-up prompts. Every other completion is parsed for a final
    answer, which is appended to ``answers`` (mutated in place) when found.

    Returns:
        ``(follow_up_prompts, hits)`` where ``hits`` is the number of parsed
        answers equal to ``ground_truth``.
    """
    follow_up_prompts = []
    hits = 0
    for request_output in request_outputs:
        completion = request_output.outputs[0]
        if completion.finish_reason == 'length':
            continue
        if completion.stop_reason == '```output':
            code = extract_last_code_block(completion.text)
            output = run_code_once_deepseek(code)[-200:]  # to strip long errors
            follow_up_prompts.append(
                request_output.prompt + completion.text + '```output\n' + output + '```')
        else:
            final_answer = parse_final_answer(completion.text)
            if final_answer is not None:
                answers.append(final_answer)
            if final_answer == ground_truth:
                hits += 1
    return follow_up_prompts, hits


def llm_solve_test(problem, ground_truth, n_samples=64):
    """Sample solutions for one problem and majority-vote the final answer.

    Pipeline:
      1. Seed pass: ``n_samples`` copies of a strategy prompt are completed;
         if a completion stopped at an '```output' fence its code is run and
         the output appended. Every seed then gets the "continue solving"
         instruction.
      2. Two solve passes: finished completions are parsed for an answer,
         completions that requested code execution are re-prompted with the
         program output. Prompts produced by the second solve pass are
         returned but not generated again.

    Args:
        problem: Problem statement text.
        ground_truth: Reference answer, only used for the printed hit counter.
        n_samples: Number of seed completions to draw (default 64).

    Returns:
        ``(final_answer, answers, prompts_to_rerun, prompts_to_rerun_2,
        prompts_to_rerun_3, request_outputs_seed)`` where ``final_answer`` is
        the most common parsed answer, or the string ``'Answer not found'``
        when no answer could be parsed.
    """
    #ground_truth = 199
    ground_truth_hit_count = 0
    answers = []
    code_strategy_seed_prompt = f"""Below is a math problem you are to solve (positive numerical answer):
\'{problem}\'
To accomplish this, first determine a sympy-based approach for solving the problem by listing each step to take
and what functions need to be called in each step. Be clear so even an idiot can follow your instructions.\n\n Approach:\n"""

    code_excecution_prompt = '\nUser: Continue solving the problem until positive numerical answer is found, put your final answer within \\boxed{}.\n Assistant:'

    request_outputs_seed = llm.generate([code_strategy_seed_prompt] * n_samples,
                                        sampling_params=sampling_params_deepseek_seed)

    # Seed completions are never discarded: each one is extended (with program
    # output when it asked for code execution) and handed to the solve pass.
    prompts_to_rerun = []
    for request_output in request_outputs_seed:
        completion = request_output.outputs[0]
        continuation = request_output.prompt + completion.text
        if completion.stop_reason == '```output':
            code = extract_last_code_block(completion.text)
            output = run_code_once_deepseek(code)[-200:]  # to strip long errors
            continuation += '```output\n' + output + '```'
        prompts_to_rerun.append(continuation + code_excecution_prompt)

    request_outputs_rerun = llm.generate(prompts_to_rerun,
                                         sampling_params=sampling_params_deepseek_solve)
    prompts_to_rerun_2, hits = _deepseek_collect_round(request_outputs_rerun, answers, ground_truth)
    ground_truth_hit_count += hits
    print("ground_truth_hit_count = ", ground_truth_hit_count)

    # Skip the engine call when nothing is left to continue.
    request_outputs_rerun_2 = (
        llm.generate(prompts_to_rerun_2, sampling_params=sampling_params_deepseek_solve)
        if prompts_to_rerun_2 else []
    )
    prompts_to_rerun_3, hits = _deepseek_collect_round(request_outputs_rerun_2, answers, ground_truth)
    ground_truth_hit_count += hits
    print("ground_truth_hit_count = ", ground_truth_hit_count)
    manage_resources()

    # Majority vote over every parsed answer.
    if answers:
        final_final_answer = int(Counter(answers).most_common(1)[0][0]) % 1000
    else:
        final_final_answer = 'Answer not found'

    return final_final_answer, answers, prompts_to_rerun, prompts_to_rerun_2, prompts_to_rerun_3, request_outputs_seed

In [None]:
# Evaluate the DeepSeek pipeline on the 10-problem sample.
llm_answers = []
all_llm_answers_list = []
score = 0
for i, row in df_sample_10.iterrows():
    problem = row['Question']
    ground_truth = int(row['Answer'])
    # Placeholders that are recorded when solving this problem fails.
    llm_answer = -1
    all_llm_answers = [-1]
    try:
        (llm_answer, all_llm_answers, prompts_to_rerun, prompts_to_rerun_2,
         prompts_to_rerun_3, request_outputs_seed) = llm_solve_test(problem, ground_truth)
        if llm_answer == ground_truth:
            score += 1
    except Exception as e:
        # Report and fall through: the placeholders above must still be
        # appended so the result lists stay aligned with df_sample_10's rows
        # (they are zipped against df_sample_10['Answer'] further down).
        print(f"Problem at index {i} failed: {e!r}")

    llm_answers.append(llm_answer)
    all_llm_answers_list.append(all_llm_answers)
print("Score:", score)

In [None]:
# save as pickle
# Persist the per-problem answer lists collected by the evaluation loop to
# 'llm_answers.pkl' in the working directory.
# Reminder: only unpickle files that come from a trusted source.
import pickle
with open('llm_answers.pkl', 'wb') as f:
    pickle.dump(all_llm_answers_list, f)

In [None]:
# For every problem whose sampled answers contain the reference answer, print
# the full list of sampled answers followed by the reference answer.
for sampled_answers, reference in zip(all_llm_answers_list, df_sample_10['Answer'].values):
    if reference not in sampled_answers:
        continue
    print(sampled_answers)
    print(reference)

[441, None, 157, 394, 121, 331, 490, 898, 394, 331, 64, 331, 331, 331, 439, 64, 232, 79, 221, 22, 110, 190, 32, 160, None, 331, 331, 20, 80, 208, 420, 81, 331, 820, 331, 15, 22, 14, 80, 898, 898, None, 396, 515, 278, 441, 146, 210, 376, 776, 331, 331, None, 450, 331, 331, 11, 92, 208, 436, 331, 341, 16, 821, 331, 331, 896, 146, 341, 331, 777, 331, 439, 331, 215, 331, 160, 432, 570, 80, 131, 211, 210, 331, 232, 331, 331, 80, 331, 189, 231, 331, 752, 704, 341, 278]
331
[360, 0, 0, 2, 240, 180, 0, 0, 4, 200, 599, 0, 9, 0, None, 0, 300, None, 0, 840, 120, 0, 720, 0, 0, 0, 36, 3, None, 0, 0, 360, 720, 20, 0, 4, 120, 0, 0, 0, 0, 0, 5, 0, 8, 0, 440, 720, 672, 48, 120, 297, 0, 0, 0, 0, 0, 40, 4, 360, 0, 360, 3, 0, 720, 800, 0, 44, 0, 0, 0, 440, 360, 0, 0, 100, 0, 0, 2, 360, 352, 0, 300, 120, 119, 4, 360, 16, 144, 360]
440
[2, 1, 41, 1, 1, 285, 15, 1, 66, 1, 113, 21, 1, 1, 41, 7, 0, 1, 2, 1, 1, 31, 587, 1, 1, 240, 41, 41, 1, 3, 1, 0, 11, 41, 1, 881, 196, 1, 1, 789, 560, 16, 196, 8, 36, 280, 1, 

# TL;DR
## We find at least one correct answer among the samples for 50% of the problems (n = 50)
## We select the correct one in X% of cases (X still to be measured)

In [None]:
# add timeout for some code runs
# add timeout for the whole notebook

In [None]:
# now using open-math
# Rebinding `model_path` to a name containing "mistral" also switches
# extract_last_code_block() to the <llm-code>...</llm-code> delimiters.
# NOTE(review): `tokenizer` and `llm` are rebound here as well. If the DeepSeek
# engine from the earlier cell is still alive in this kernel, a second engine
# with gpu_memory_utilization=0.99 may not fit on the GPU - confirm on a fresh
# top-to-bottom run.
#model_path = "deepseek-ai/deepseek-math-7b-rl"
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import torch
model_path = "nvidia/OpenMath-Mistral-7B-v0.1-hf"

# The tokenizer is used below to look up the EOS token id for the sampling parameters.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Same engine settings as for the DeepSeek model, except that the weights are
# loaded with dtype='half' instead of 'auto'.
llm = LLM(model=model_path,
          dtype='half',
          enforce_eager=True, # setting to False might help get more speed
          gpu_memory_utilization=0.99,
          swap_space=3,
          max_model_len=2048,
          kv_cache_dtype="fp8_e5m2",
          tensor_parallel_size=torch.cuda.device_count())
          #trust_remote_code=True,

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

INFO 06-27 19:04:34 config.py:400] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. Meanwhile, it may cause accuracy drop without a proper scaling factor
INFO 06-27 19:04:34 llm_engine.py:161] Initializing an LLM engine (v0.5.0.post1) with config: model='nvidia/OpenMath-Mistral-7B-v0.1-hf', speculative_config=None, tokenizer='nvidia/OpenMath-Mistral-7B-v0.1-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=fp8_e5m2, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=nvidia/OpenMath-Mistral-7B-v0.1-hf)


generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

INFO 06-27 19:04:38 selector.py:141] Cannot use FlashAttention-2 backend for FP8 KV cache.
INFO 06-27 19:04:38 selector.py:51] Using XFormers backend.
INFO 06-27 19:04:40 selector.py:141] Cannot use FlashAttention-2 backend for FP8 KV cache.
INFO 06-27 19:04:40 selector.py:51] Using XFormers backend.
INFO 06-27 19:04:41 weight_utils.py:218] Using model weights format ['*.safetensors']


model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

INFO 06-27 19:06:23 model_runner.py:160] Loading model weights took 13.4966 GB
INFO 06-27 19:06:25 gpu_executor.py:83] # GPU blocks: 8192, # CPU blocks: 3072


In [None]:
# Generation stops at the tokenizer's EOS token or when the model closes a code
# block. The closing tag is kept in the returned text so that
# extract_last_code_block() can still match the complete <llm-code>...</llm-code> pair.
openmath_stop_word = '</llm-code>'
sampling_params_openmath = SamplingParams(
    temperature=0.85,
    top_p=0.95,
    max_tokens=200,
    stop_token_ids=[tokenizer.eos_token_id],
    stop=openmath_stop_word,
    include_stop_str_in_output=True,
)

In [None]:
from collections import Counter

def _openmath_collect_round(request_outputs, answers, ground_truth):
    """Process one batch of OpenMath generations.

    Completions cut off by the token limit are dropped. Completions that
    stopped at ``openmath_stop_word`` have their last code block executed and
    become follow-up prompts carrying the program output. Every other
    completion is parsed for a final answer, which is appended to ``answers``
    (mutated in place) when found.

    Returns:
        ``(follow_up_prompts, hits)`` where ``hits`` is the number of parsed
        answers equal to ``ground_truth``.
    """
    follow_up_prompts = []
    hits = 0
    for request_output in request_outputs:
        completion = request_output.outputs[0]
        if completion.finish_reason == 'length':
            continue
        if completion.stop_reason == openmath_stop_word:
            code = extract_last_code_block(completion.text)
            output = run_code_once_deepseek(code)[-200:]  # to strip long errors
            follow_up_prompts.append(
                request_output.prompt + completion.text
                + "<llm-code-output>" + output + "</llm-code-output>")
        else:
            final_answer = parse_final_answer(completion.text)
            if final_answer is not None:
                answers.append(final_answer)
            if final_answer == ground_truth:
                hits += 1
    return follow_up_prompts, hits


def llm_solve_test_openmath(problem, ground_truth, n_samples=64):
    """Sample OpenMath solutions for one problem and majority-vote the answer.

    ``n_samples`` completions of the System/User/Assistant prompt are drawn.
    For up to three rounds, completions that closed a code block have the code
    executed and its output appended before being continued; finished
    completions are parsed for an answer. Prompts produced by the third round
    are returned but not generated again.

    Args:
        problem: Problem statement text.
        ground_truth: Reference answer, only used for the printed hit counter.
        n_samples: Number of initial completions to draw (default 64).

    Returns:
        ``(final_answer, answers, prompts_to_rerun, prompts_to_rerun_2,
        prompts_to_rerun_3, request_outputs_seed)`` where ``final_answer`` is
        the most common parsed answer, or the string ``'Answer not found'``
        when no answer could be parsed.
    """
    #ground_truth = 199
    ground_truth_hit_count = 0
    answers = []
    # The backslash must be escaped: in a plain string literal "\b" is a
    # backspace character, which turned the instruction into "<BS>oxed{}".
    # The sentences are joined explicitly so source indentation no longer
    # leaks into the prompt.
    # NOTE(review): single-space joining is believed to match the published
    # OpenMath prompt template - confirm against the model card.
    system = ("You're an expert Python programmer and mathematician. "
              "Help the user to solve this problem using code when necessary. "
              "Make sure to put the answer (and only answer) inside \\boxed{}.")
    context = "The answer is a positive integer or 0"
    user = f"{problem}\n\n{context}"
    # Blank line before "Assistant:" so it is no longer glued to the user text.
    prompt_template = f"System:\n{system}\n\nUser:\n{user}\n\nAssistant:\n"

    request_outputs_seed = llm.generate([prompt_template] * n_samples,
                                        sampling_params=sampling_params_openmath)
    prompts_to_rerun, hits = _openmath_collect_round(request_outputs_seed, answers, ground_truth)
    ground_truth_hit_count += hits
    print("ground_truth_hit_count = ", ground_truth_hit_count)

    # Skip the engine call when nothing is left to continue.
    request_outputs_rerun = (
        llm.generate(prompts_to_rerun, sampling_params=sampling_params_openmath)
        if prompts_to_rerun else []
    )
    prompts_to_rerun_2, hits = _openmath_collect_round(request_outputs_rerun, answers, ground_truth)
    ground_truth_hit_count += hits
    print("ground_truth_hit_count = ", ground_truth_hit_count)

    request_outputs_rerun_2 = (
        llm.generate(prompts_to_rerun_2, sampling_params=sampling_params_openmath)
        if prompts_to_rerun_2 else []
    )
    prompts_to_rerun_3, hits = _openmath_collect_round(request_outputs_rerun_2, answers, ground_truth)
    ground_truth_hit_count += hits
    print("ground_truth_hit_count = ", ground_truth_hit_count)
    manage_resources()

    # Majority vote over every parsed answer.
    vote_counts = Counter(answers)
    if answers:
        final_final_answer = int(vote_counts.most_common(1)[0][0]) % 1000
    else:
        final_final_answer = 'Answer not found'
    print(vote_counts.most_common(2))

    return final_final_answer, answers, prompts_to_rerun, prompts_to_rerun_2, prompts_to_rerun_3, request_outputs_seed

In [None]:
import time
import json
from datetime import datetime

# Evaluate the OpenMath pipeline on the 10-problem sample and keep everything
# needed to inspect the run afterwards. Each list in `data` gets one entry per
# problem, in the row order of df_sample_10.
llm_answers = []
all_llm_answers_list = []
score = 0
data = {
    "llm_answer": [],
    "all_llm_answers": [],
    "prompts_to_rerun": [],
    "prompts_to_rerun_2": [],
    "prompts_to_rerun_3": [],
    "request_outputs_seed": []
}

for i, row in df_sample_10.iterrows():
    problem = row['Question']
    ground_truth = int(row['Answer'])
    # Placeholders that are recorded when solving this problem fails.
    llm_answer = -1
    all_llm_answers = [-1]
    prompts_to_rerun, prompts_to_rerun_2, prompts_to_rerun_3 = [], [], []
    request_outputs_seed = []
    try:
        (llm_answer, all_llm_answers, prompts_to_rerun, prompts_to_rerun_2,
         prompts_to_rerun_3, request_outputs_seed) = llm_solve_test_openmath(problem, ground_truth)
        if llm_answer == ground_truth:
            score += 1
    except Exception as e:
        # Report and fall through so the placeholders are still recorded and
        # the result lists stay aligned with df_sample_10's rows.
        print(f"Problem at index {i} failed: {e!r}")

    llm_answers.append(llm_answer)
    all_llm_answers_list.append(all_llm_answers)

    data["llm_answer"].append(llm_answer)
    data["all_llm_answers"].append(all_llm_answers)
    data["prompts_to_rerun"].append(prompts_to_rerun)
    data["prompts_to_rerun_2"].append(prompts_to_rerun_2)
    data["prompts_to_rerun_3"].append(prompts_to_rerun_3)
    # The engine's output objects are not JSON-serialisable; keep only the
    # generated text of each seed completion.
    data["request_outputs_seed"].append(
        [request_output.outputs[0].text for request_output in request_outputs_seed])

# Report the score before touching the file system so that a failed save can
# no longer hide the result.
print("Score:", score)

time_code = datetime.now().strftime("%Y%m%d_%H%M%S")
file_name = f"llm_data_{time_code}.json"

# '/mnt/data' did not exist in this runtime (FileNotFoundError), so the file is
# written to the working directory instead.
with open(file_name, 'w') as file:
    json.dump(data, file)

Processed prompts: 100%|██████████| 64/64 [00:17<00:00,  3.57it/s, est. speed input: 428.36 toks/s, output: 654.26 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 29/29 [00:09<00:00,  2.99it/s, est. speed input: 917.47 toks/s, output: 77.14 toks/s]


ground_truth_hit_count =  2


Processed prompts: 0it [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


ground_truth_hit_count =  2
[(760, 6), (462, 2)]


Processed prompts: 100%|██████████| 64/64 [00:18<00:00,  3.55it/s, est. speed input: 458.38 toks/s, output: 597.35 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 35/35 [00:16<00:00,  2.18it/s, est. speed input: 672.99 toks/s, output: 148.81 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 7/7 [00:12<00:00,  1.85s/it, est. speed input: 252.88 toks/s, output: 38.93 toks/s]


ground_truth_hit_count =  0
[(2, 4), (0, 4)]


Processed prompts: 100%|██████████| 64/64 [00:20<00:00,  3.10it/s, est. speed input: 665.70 toks/s, output: 586.79 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 13/13 [00:13<00:00,  1.07s/it, est. speed input: 364.40 toks/s, output: 82.34 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 4/4 [00:12<00:00,  3.13s/it, est. speed input: 180.76 toks/s, output: 20.71 toks/s]


ground_truth_hit_count =  0
[(3, 3), (373, 1)]


Processed prompts: 100%|██████████| 64/64 [00:21<00:00,  2.99it/s, est. speed input: 800.00 toks/s, output: 501.63 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 28/28 [00:16<00:00,  1.74it/s, est. speed input: 758.41 toks/s, output: 60.33 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.84s/it, est. speed input: 259.31 toks/s, output: 15.88 toks/s]


ground_truth_hit_count =  0
[(9, 5), (94, 3)]


Processed prompts: 100%|██████████| 64/64 [00:19<00:00,  3.36it/s, est. speed input: 520.33 toks/s, output: 647.32 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 13/13 [00:13<00:00,  1.07s/it, est. speed input: 339.49 toks/s, output: 105.97 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 3/3 [00:12<00:00,  4.09s/it, est. speed input: 144.99 toks/s, output: 18.33 toks/s]


ground_truth_hit_count =  0
[(32, 4), (33, 2)]


Processed prompts: 100%|██████████| 64/64 [00:18<00:00,  3.51it/s, est. speed input: 477.26 toks/s, output: 582.75 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 38/38 [00:15<00:00,  2.44it/s, est. speed input: 799.61 toks/s, output: 64.26 toks/s] 


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 2/2 [00:10<00:00,  5.37s/it, est. speed input: 111.77 toks/s, output: 18.05 toks/s]


ground_truth_hit_count =  0
[(251, 3), (0, 3)]


Processed prompts: 100%|██████████| 64/64 [00:17<00:00,  3.58it/s, est. speed input: 529.16 toks/s, output: 508.60 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 58/58 [00:19<00:00,  2.94it/s, est. speed input: 1066.76 toks/s, output: 123.63 toks/s]


ground_truth_hit_count =  14


Processed prompts: 100%|██████████| 14/14 [00:11<00:00,  1.27it/s, est. speed input: 722.09 toks/s, output: 33.57 toks/s]


ground_truth_hit_count =  16
[(8, 30), (24, 16)]


Processed prompts: 100%|██████████| 64/64 [00:20<00:00,  3.19it/s, est. speed input: 606.58 toks/s, output: 609.52 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 15/15 [00:14<00:00,  1.07it/s, est. speed input: 399.55 toks/s, output: 69.92 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it, est. speed input: 509.05 toks/s, output: 14.81 toks/s]


ground_truth_hit_count =  0
[(1, 4), (50, 2)]


Processed prompts: 100%|██████████| 64/64 [00:25<00:00,  2.51it/s, est. speed input: 1106.48 toks/s, output: 460.86 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 20/20 [00:13<00:00,  1.48it/s, est. speed input: 894.51 toks/s, output: 49.63 toks/s] 


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 3/3 [00:04<00:00,  1.44s/it, est. speed input: 503.54 toks/s, output: 25.20 toks/s]


ground_truth_hit_count =  0
[(6, 7), (0, 3)]


Processed prompts: 100%|██████████| 64/64 [00:21<00:00,  3.03it/s, est. speed input: 733.66 toks/s, output: 565.31 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 18/18 [00:14<00:00,  1.20it/s, est. speed input: 521.43 toks/s, output: 89.47 toks/s]


ground_truth_hit_count =  0


Processed prompts: 100%|██████████| 3/3 [00:08<00:00,  2.99s/it, est. speed input: 212.20 toks/s, output: 18.64 toks/s]


ground_truth_hit_count =  0
[(4, 4), (1, 2)]


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/llm_data_20240627_191726.json'

In [None]:
# Inspect the second-round prompts of the last problem handled by the
# evaluation loop (the loop rebinds this name on every iteration).
prompts_to_rerun_2

['System:\nYou\'re an expert Python programmer and mathematician.\n  Help the user to solve this problem using code when necessary.\n  Make sure to put the answer (and only answer) inside \x08oxed{}.\n\nUser:\nSeven teams play a soccer tournament in which each team plays every other team exactly once. No ties occur, each team has a $50\\%$ chance of winning each game it plays, and the outcomes of the games are independent. In each game, the winner is awarded a point and the loser gets 0 points. The total points are accumulated to decide the ranks of the teams. In the first game of the tournament, team $A$ beats team $B.$ The probability that team $A$ finishes with more points than team $B$ is $m/n,$ where $m$ and $n$ are relatively prime positive integers. Find $m+n.$\n\nThe answer is a positive integer or 0Assistant:\n Let\'s write down an equation for the probability that team $A$ finishes with more points than team $B$ and then solve for $m+n$.\n<llm-code>\n# probability that team A