In [1]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel. Mode 2 reloads all modules before
# each cell execution.
%load_ext autoreload
%autoreload 2

In [2]:
from vllm import LLM, SamplingParams
from vllm.sampling_params import StructuredOutputsParams
import json_repair
import json
from typing import List, Dict
from tqdm import tqdm
from prompts import Output, prompt_m1_concept_node_validity_ordinal, prompt_m1_concept_triplet_accuracy_ordinal

In [3]:
def batch_llm_inference(llm, messages_list: List[List[Dict]], schema: dict, temperature: float = 0.7, max_tokens: int = 2048) -> List[dict]:
    """
    Perform batch chat inference with JSON-schema-constrained output.

    Args:
        llm: vLLM model object; only its ``chat`` method is used here.
        messages_list: List of message sequences (each is a list of message dicts)
        schema: JSON schema used to constrain the structured output
        temperature: Sampling temperature
        max_tokens: Maximum number of tokens to generate per response

    Returns:
        One entry per result returned by ``llm.chat``, in the same order.
        Each entry is whatever ``json_repair.loads`` produced for that
        response text, or ``None`` if parsing raised an exception.
    """
    sampling_params = SamplingParams(
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        structured_outputs=StructuredOutputsParams(json=schema),
    )

    # NOTE(review): include_reasoning=False is forwarded to the chat template;
    # presumably it keeps reasoning text out of the prompt/response - confirm
    # against the model's chat template.
    request_outputs = llm.chat(
        messages_list,
        sampling_params,
        chat_template_kwargs={"include_reasoning": False},
    )

    # Keep only the text of the first completion for each request.
    responses = [r.outputs[0].text for r in request_outputs]

    # Parse all responses; a failure is logged and recorded as None so that the
    # output stays aligned with the input order.
    parsed_responses = []
    for response in responses:
        try:
            parsed = json_repair.loads(response)
            parsed_responses.append(parsed)
        except Exception as e:
            print(f"Error parsing response: {e}")
            # `response` is already the raw text (a str), not a request output
            # object, so print it directly.
            print(f"Response text: {response}")
            parsed_responses.append(None)

    return parsed_responses

In [None]:
# Build the vLLM engine once; the helper cells reuse this `llm` object.
llm = LLM(
    model="openai/gpt-oss-120b",
    tensor_parallel_size=1,
    max_model_len=16384,
    gpu_memory_utilization=0.9,
    max_num_seqs=400,
)


INFO 02-04 21:36:42 [utils.py:261] non-default args: {'max_model_len': 16384, 'max_num_seqs': 400, 'disable_log_stats': True, 'model': 'openai/gpt-oss-120b'}
INFO 02-04 21:36:43 [model.py:541] Resolved architecture: GptOssForCausalLM


Parse safetensors files:   0%|          | 0/15 [00:00<?, ?it/s]

INFO 02-04 21:36:43 [model.py:1561] Using max model len 16384
INFO 02-04 21:36:46 [scheduler.py:226] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 02-04 21:36:46 [config.py:314] Overriding max cuda graph capture size to 1024 for performance.
INFO 02-04 21:36:46 [vllm.py:624] Asynchronous scheduling is enabled.
[0;36m(EngineCore_DP0 pid=47836)[0;0m INFO 02-04 21:36:47 [core.py:96] Initializing a V1 LLM engine (v0.15.0) with config: model='openai/gpt-oss-120b', speculative_config=None, tokenizer='openai/gpt-oss-120b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=mxfp4, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutpu

[0;36m(EngineCore_DP0 pid=47836)[0;0m We recommend installing via `pip install torch-c-dlpack-ext`


[0;36m(EngineCore_DP0 pid=47836)[0;0m INFO 02-04 21:36:56 [cuda.py:364] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'TRITON_ATTN')
[0;36m(EngineCore_DP0 pid=47836)[0;0m INFO 02-04 21:36:56 [mxfp4.py:164] Using Triton backend


Loading safetensors checkpoint shards:   0% Completed | 0/15 [00:00<?, ?it/s]


In [6]:
from prompts import Output, prompt_m1_concept_node_validity_ordinal, prompt_m1_concept_triplet_accuracy_ordinal

In [None]:
# JSON schema derived from the `Output` type imported from `prompts`.
# NOTE(review): presumably `Output` is a Pydantic model and this schema is what
# gets passed as `schema` to batch_llm_inference - confirm against later cells.
schema = Output.model_json_schema()
