In [1]:
import random
import os
import argparse
import time
from vllm import LLM, SamplingParams
from datetime import datetime
from tqdm import tqdm

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from evaluate import evaluate
from utils import set_seed, load_jsonl, save_jsonl, construct_prompt
from parser import *
from trajectory import *
from data_loader import load_data
from python_executor import PythonExecutor
from model_utils import load_hf_lm_and_tokenizer, generate_completions
from math_eval import parse_args, set_seed, setup, main

import pandas as pd

INFO 05-15 12:23:35 [__init__.py:239] Automatically detected platform cuda.


In [2]:
# Set TOKENIZERS_PARALLELISM=false in this process's environment before any model/tokenizer is created.
# NOTE(review): presumably this is to avoid the Hugging Face tokenizers fork/parallelism warning
# once vLLM spawns worker processes — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# Command-line options for the evaluation run, listed in registration order as
# (flag, keyword arguments forwarded to ``ArgumentParser.add_argument``).
_OPTION_SPECS = (
    ("--data_names", dict(default="gsm8k,math", type=str)),
    ("--data_dir", dict(default="./data", type=str)),
    ("--model_name_or_path", dict(default="gpt-4", type=str)),
    ("--output_dir", dict(default="./output", type=str)),
    ("--prompt_type", dict(default="tool-integrated", type=str)),
    ("--split", dict(default="test", type=str)),
    ("--num_test_sample", dict(default=-1, type=int)),  # -1 for full data
    ("--seed", dict(default=0, type=int)),
    ("--start", dict(default=0, type=int)),
    ("--end", dict(default=-1, type=int)),
    ("--temperature", dict(default=0, type=float)),
    ("--n_sampling", dict(default=1, type=int)),
    ("--top_p", dict(default=1, type=float)),
    ("--max_tokens_per_call", dict(default=2048, type=int)),
    ("--shuffle", dict(action="store_true")),
    ("--use_vllm", dict(action="store_true")),
    ("--save_outputs", dict(action="store_true")),
    ("--overwrite", dict(action="store_true")),
    ("--use_safetensors", dict(action="store_true")),
    ("--num_shots", dict(type=int, default=0)),
    (
        "--apply_chat_template",
        dict(action="store_true", help="Apply chat template to prompt."),
    ),
    ("--pipeline_parallel_size", dict(type=int, default=1)),
    (
        "--adapt_few_shot",
        dict(
            action="store_true",
            help="Few shot for multiple-choice questions, zero shot for others.",
        ),
    ),
)

parser = argparse.ArgumentParser()
for _flag, _add_argument_kwargs in _OPTION_SPECS:
    parser.add_argument(_flag, **_add_argument_kwargs)

# Parse an explicit argument list (only --use_vllm) instead of the process's argv.
args = parser.parse_args(args=["--use_vllm"])

# top_p must be 1 when using greedy sampling (vllm)
if args.temperature == 0:
    args.top_p = 1

In [None]:
# Per-run overrides applied on top of the parsed defaults. ``vars(args)`` is the
# namespace's attribute dict, so updating it sets the attributes in place.
vars(args).update(
    model_name_or_path="/home/aiops/chenxw/hfmodels/Qwen2.5-Math-7B",
    data_names="math500",
    output_dir="math_eval-cot",
    split="test",
    prompt_type="qwen25-math-cot",
    seed=0,
    temperature=0,
    n_sampling=1,
    top_p=1,
    start=0,
    end=-1,
    save_outputs=True,
    apply_chat_template=False,
    overwrite=False,
)

# Seed the run with the value chosen above.
set_seed(args.seed)

Random seed set as 0


In [5]:
from vllm.distributed import (destroy_distributed_environment,
                              destroy_model_parallel)
import contextlib
import gc

def cleanup():
    """Tear down distributed state and release cached GPU memory.

    Calls vLLM's ``destroy_model_parallel`` and
    ``destroy_distributed_environment``, then attempts
    ``torch.distributed.destroy_process_group`` (an ``AssertionError`` from
    that call is ignored), and finally runs the garbage collector and empties
    the CUDA caching allocator.
    """
    destroy_model_parallel()
    destroy_distributed_environment()

    try:
        torch.distributed.destroy_process_group()
    except AssertionError:
        # Best effort: an assertion failure here is deliberately ignored.
        pass

    gc.collect()
    torch.cuda.empty_cache()

In [6]:
# GPU count used when the engine is built: tensor_parallel_size is computed as
# available_gpus // args.pipeline_parallel_size.
available_gpus = 2
# Empty frame; later cells assign columns to it.
df = pd.DataFrame()

In [13]:
# Drop the notebook's reference to the previous engine (if any) so that
# cleanup() can actually free it.
try:
    del llm2
except NameError:
    # llm2 was never created (or was already deleted): nothing to release.
    # Only NameError is expected here; a bare ``except`` would also swallow
    # KeyboardInterrupt and unrelated failures.
    pass
cleanup()

In [12]:
# Leftover scaffolding from a sweep over checkpoints (kept for reference):
# for step in range(10, 16, 5):
# step = 10

# Checkpoint to evaluate; the commented line below switches back to the base model.
args.model_name_or_path = "/home/aiops/chenxw/verl/checkpoints/verl_few_shot/Qwen2.5-Math-7B-true_pi1_aime/global_step_115_hf"
# args.model_name_or_path = "/home/aiops/chenxw/hfmodels/Qwen2.5-Math-7B"

# Build the vLLM engine; the GPUs are split between tensor and pipeline parallelism.
llm2 = LLM(
    model=args.model_name_or_path,
    tensor_parallel_size=available_gpus // args.pipeline_parallel_size,
    pipeline_parallel_size=args.pipeline_parallel_size,
    trust_remote_code=True,
)

# A tokenizer is only loaded when the chat template has to be applied.
tokenizer = (
    AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True)
    if args.apply_chat_template
    else None
)

# Run main() once per comma-separated dataset name and collect what it returns.
data_list = args.data_names.split(",")
results = []
for data_name in data_list:
    results += [main(llm2, tokenizer, data_name, args)]

# Disabled: per-sample score extraction into `df` (kept for reference).
# ids = []
# scores = []

# for sample in results[0][1]:
#     ids.append(sample["idx"])
#     scores.append(sum(sample["score"]) / args.n_sampling)

# df[step] = pd.Series(scores, ids)

INFO 05-15 12:27:24 [config.py:600] This model supports multiple tasks: {'embed', 'score', 'classify', 'generate', 'reward'}. Defaulting to 'generate'.
INFO 05-15 12:27:24 [config.py:1600] Defaulting to use mp for distributed inference
INFO 05-15 12:27:24 [config.py:1780] Chunked prefill is enabled with max_num_batched_tokens=8192.


INFO 05-15 12:27:25 [core.py:61] Initializing a V1 LLM engine (v0.8.3) with config: model='/home/aiops/chenxw/verl/checkpoints/verl_few_shot/Qwen2.5-Math-7B-true_pi1_aime/global_step_115_hf', speculative_config=None, tokenizer='/home/aiops/chenxw/verl/checkpoints/verl_few_shot/Qwen2.5-Math-7B-true_pi1_aime/global_step_115_hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_mod

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


[1;36m(VllmWorker rank=1 pid=21176)[0;0m INFO 05-15 12:27:45 [loader.py:447] Loading weights took 16.03 seconds
[1;36m(VllmWorker rank=0 pid=21158)[0;0m INFO 05-15 12:27:45 [loader.py:447] Loading weights took 15.99 seconds
[1;36m(VllmWorker rank=1 pid=21176)[0;0m INFO 05-15 12:27:46 [gpu_model_runner.py:1273] Model loading took 7.1148 GiB and 16.496002 seconds
[1;36m(VllmWorker rank=0 pid=21158)[0;0m INFO 05-15 12:27:46 [gpu_model_runner.py:1273] Model loading took 7.1148 GiB and 16.536179 seconds
[1;36m(VllmWorker rank=1 pid=21176)[0;0m INFO 05-15 12:27:57 [backends.py:416] Using cache directory: /home/aiops/chenxw/.cache/vllm/torch_compile_cache/7be0024083/rank_1_0 for vLLM's torch.compile
[1;36m(VllmWorker rank=1 pid=21176)[0;0m INFO 05-15 12:27:57 [backends.py:426] Dynamo bytecode transform time: 10.92 s
[1;36m(VllmWorker rank=0 pid=21158)[0;0m INFO 05-15 12:27:57 [backends.py:416] Using cache directory: /home/aiops/chenxw/.cache/vllm/torch_compile_cache/7be0024083/r

0it [00:00, ?it/s]


-------------------- Epoch 0
Unsolved samples: 0


Evaluate: 100%|██████████| 500/500 [00:03<00:00, 156.68it/s]


{'num_samples': 500, 'num_scores': 500, 'timeout_samples': 0, 'empty_samples': 73, 'acc': np.float64(52.4)}


In [110]:
# Rebind `df` to a fresh, empty DataFrame (the same statement also appears in an earlier cell).
df = pd.DataFrame()

In [71]:
# Add a column named by `step`, holding `scores` indexed by `ids`.
# NOTE(review): `step`, `scores` and `ids` are not assigned by any live code in the visible
# cells (only commented-out code in the evaluation cell builds them) — confirm they exist
# in the kernel before running this cell.
df[step] = pd.Series(scores, ids)

In [75]:
# Write `df` to MATH500Pass1.csv (relative path, so it lands in the kernel's working directory).
df.to_csv('MATH500Pass1.csv')

In [117]:
# Small hand-written values and index labels.
# NOTE(review): looks like scratch data for trying out Series/column assignment — confirm it is still needed.
score = [11, 22, 33]
idx = [1, 2, 3]

In [122]:
# Assign column 3 from `score`, labelled by `idx`, stored as float64.
# The dtype is given as the string "float64" (equivalent to np.float64) because `np` is
# never imported explicitly in this notebook and would otherwise have to leak in through
# a wildcard import.
df[3] = pd.Series(score, index=idx, dtype="float64")

In [123]:
# Display the first rows of `df` as the cell output.
df.head()

Unnamed: 0,1,2,3
1,11.0,11.0,11.0
2,22.0,22.0,22.0
3,55.0,55.0,33.0


In [None]:
# Preview `scores` as a float64 Series labelled by `idx`, built from a copy of the input.
# The dtype is given as the string "float64" (equivalent to np.float64) because `np` is
# never imported explicitly in this notebook.
# NOTE(review): `scores` and the `idx` that matches it are not assigned by live code in the
# visible cells — confirm both exist and have the same length before running.
pd.Series(scores, index=idx, dtype="float64", copy=True)

0      1.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
495    1.0
496    1.0
497    0.0
498    0.0
499    0.0
Length: 500, dtype: float64

In [128]:
# Keep a separate copy of the current `scores` under its own name.
# NOTE(review): `scores` is not assigned by live code in the visible cells, and the meaning of
# the "5" suffix is not shown here — confirm and consider a more descriptive name.
scores5 = scores.copy()