In [4]:
import torch
import numpy as np
import random
import pandas as pd
import os
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import vllm
from vllm import LLM, SamplingParams
from tqdm.auto import tqdm
from helpers import IOU, blend_intervals
from vllm.lora.request import LoRARequest

In [5]:
def seed_everything(seed):
    """Seed every RNG this notebook touches and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to Python's hash seed environment variable,
            the `random` module, NumPy's legacy global RNG and PyTorch
            (CPU and all CUDA devices).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; this call is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # The cuDNN autotuner may select different kernels from run to run, so it
    # has to be switched off when deterministic behaviour is requested.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed_everything(seed=56)

### Указываем, какие LoRA-адаптеры нам нужны

In [7]:
# One LoRA request per cross-validation fold (folds 0..4), all on the same
# base model. Adapter weights are read from ./outputs/qwen-sft-fold-{i}.
lora_reqs = [
    LoRARequest(
        # Each adapter gets its own name. NOTE(review): vLLM appears to compare
        # and hash LoRARequest objects by lora_name, so a name shared across
        # folds risks different adapters being treated as one (e.g. in caches)
        # - confirm against the installed vLLM version.
        lora_name=f"qwen-sft-fold-{i}",
        # Integer ids start at 1 and are unique per adapter.
        lora_int_id=i + 1,
        lora_path=f"./outputs/qwen-sft-fold-{i}",
        base_model_name="Qwen/Qwen3-4B-Instruct-2507"
    ) for i in range(5)
]

### Подгружаем ту же модель через vLLM

In [8]:
# Build the vLLM engine for the base model; LoRA adapters are attached per
# request at generation time, so only LoRA support needs enabling here.
llm = LLM(
    model="Qwen/Qwen3-4B-Instruct-2507",
    trust_remote_code=True,
    # Capacity limits: context window and number of concurrent sequences.
    max_model_len=81920,
    max_num_seqs=64,
    tensor_parallel_size=1,
    # LoRA support; the rank limit must cover the adapters being served.
    enable_lora=True,
    max_lora_rank=16,
    # Same seed value as the one passed to seed_everything().
    seed=56,
)

# Reuse the engine's own tokenizer so the chat template matches the model.
tokenizer = llm.get_tokenizer()

INFO 10-22 15:48:52 [utils.py:233] non-default args: {'trust_remote_code': True, 'seed': 56, 'max_model_len': 81920, 'max_num_seqs': 64, 'disable_log_stats': True, 'enable_lora': True, 'model': 'Qwen/Qwen3-4B-Instruct-2507'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 10-22 15:48:53 [model.py:547] Resolved architecture: Qwen3ForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 10-22 15:48:54 [model.py:1510] Using max model len 81920
INFO 10-22 15:48:57 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
[1;36m(EngineCore_DP0 pid=37735)[0;0m INFO 10-22 15:48:58 [core.py:644] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=37735)[0;0m INFO 10-22 15:48:58 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='Qwen/Qwen3-4B-Instruct-2507', speculative_config=None, tokenizer='Qwen/Qwen3-4B-Instruct-2507', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=81920, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False,

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


[1;36m(EngineCore_DP0 pid=37735)[0;0m INFO 10-22 15:49:04 [default_loader.py:267] Loading weights took 1.74 seconds
[1;36m(EngineCore_DP0 pid=37735)[0;0m INFO 10-22 15:49:04 [punica_selector.py:19] Using PunicaWrapperGPU.
[1;36m(EngineCore_DP0 pid=37735)[0;0m INFO 10-22 15:49:05 [gpu_model_runner.py:2653] Model loading took 7.6684 GiB and 2.774170 seconds
[1;36m(EngineCore_DP0 pid=37735)[0;0m INFO 10-22 15:49:17 [backends.py:548] Using cache directory: /root/.cache/vllm/torch_compile_cache/72c553a21c/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=37735)[0;0m INFO 10-22 15:49:17 [backends.py:559] Dynamo bytecode transform time: 10.26 s
[1;36m(EngineCore_DP0 pid=37735)[0;0m INFO 10-22 15:49:22 [backends.py:164] Directly load the compiled graph(s) for dynamic shape from the cache, took 3.255 s
[1;36m(EngineCore_DP0 pid=37735)[0;0m INFO 10-22 15:49:24 [monitor.py:34] torch.compile takes 10.26 s in total
[1;36m(EngineCore_DP0 pid=37735)[0;0m INFO 10-22 

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 19/19 [00:02<00:00,  7.95it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 11/11 [00:01<00:00,  7.42it/s]


[1;36m(EngineCore_DP0 pid=37735)[0;0m INFO 10-22 15:49:32 [gpu_model_runner.py:3480] Graph capturing finished in 5 secs, took 0.21 GiB
[1;36m(EngineCore_DP0 pid=37735)[0;0m INFO 10-22 15:49:32 [core.py:210] init engine (profile, create kv cache, warmup model) took 26.47 seconds
INFO 10-22 15:49:33 [llm.py:306] Supported_tasks: ['generate']


In [10]:
# Sampling setup shared by every fold: 32 stochastic completions per prompt.
sampling_params = SamplingParams(
    # How many completions to draw per prompt, and the RNG seed for sampling.
    n=32,
    seed=56,
    # Shape of the sampling distribution.
    temperature=0.6,
    top_k=20,
    top_p=0.95,
    min_p=0.0,
    # Output handling: generation budget and special-token stripping.
    max_tokens=81920,
    skip_special_tokens=True,
)

In [12]:
# Instruction text sent as the system message. The wording is reproduced
# verbatim; presumably it has to match the prompt used during fine-tuning
# (TODO confirm against the training notebook).
SYSTEM_INSTRUCTIONS = (
    "You tag mistakes in student math solutions.\n"
    "- Output must be EXACTLY the student's solution text, with <mistake>...</mistake> tags around mistakes.\n"
    "- Do NOT add or remove any other text, lines, or spaces.\n"
    "- Do NOT add commentary or explanations.\n"
)

# The inference-time instructions are the same object as the system instructions.
INFER_INSTRUCTIONS = SYSTEM_INSTRUCTIONS


def build_prompt(task: str, solution: str) -> str:
    """Assemble the user-turn text for one (problem, student solution) pair.

    Args:
        task: Problem statement.
        solution: Student's solution text that the model should tag.

    Returns:
        The instructions followed by the "Problem", "Student solution" and an
        empty "Tagged solution" section, each separated by blank lines.
    """
    sections = [
        f"{INFER_INSTRUCTIONS}\n\n",
        f"Problem:\n{task}\n\n",
        f"Student solution:\n{solution}\n\n",
        "Tagged solution:\n",
    ]
    return "".join(sections)

def get_answer_from_model(rows, lora_req):
    """Generate tagged solutions for a batch of rows with one LoRA adapter.

    Uses the notebook-level `tokenizer`, `llm` and `sampling_params`.

    Args:
        rows: Iterable of records supporting `row['task']` and
            `row['solution']` lookups.
        lora_req: LoRA request forwarded to `llm.generate`.

    Returns:
        Whatever `llm.generate` returns for the rendered prompts, in the same
        order as `rows`.
    """
    prompts = []
    for row in rows:
        # Two-turn conversation: system instructions plus the user prompt
        # (which build_prompt also prefixes with the instruction text).
        conversation = [
            {"role": "system", "content": SYSTEM_INSTRUCTIONS},
            {"role": "user", "content": build_prompt(row['task'], row['solution'])},
        ]
        # Render to plain text and leave the assistant turn open for generation.
        rendered = tokenizer.apply_chat_template(
            conversation=conversation,
            tokenize=False,
            add_generation_prompt=True,
        )
        prompts.append(rendered)

    return llm.generate(prompts=prompts, sampling_params=sampling_params, lora_request=lora_req)

### Запускаем инференс

In [13]:
# Test split to run inference on; the 'task' and 'solution' columns are read
# later when prompts are built.
test_csv_path = 'test_private_new_without_answer.csv'
test_data = pd.read_csv(test_csv_path)

In [15]:
# Materialise the frame as a list of row Series (file order), dropping the index labels.
test_data_rows = [row for _index, row in test_data.iterrows()]

In [17]:
# Run the whole test set once per fold adapter; entry k holds the generation
# results obtained with lora_reqs[k]. Results are appended fold by fold so
# completed folds remain available if a later one is interrupted.
preds_test_i_model = []
for adapter in lora_reqs:
    preds_test_i_model.append(
        get_answer_from_model(test_data_rows, lora_req=adapter)
    )

Adding requests:   0%|          | 0/450 [00:00<?, ?it/s]



Processed prompts:   0%|          | 0/14400 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/…

Adding requests:   0%|          | 0/450 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/14400 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/…

Adding requests:   0%|          | 0/450 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/14400 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/…

Adding requests:   0%|          | 0/450 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/14400 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/…

Adding requests:   0%|          | 0/450 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/14400 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/…

In [19]:
# Non-greedy match of one <mistake>...</mistake> pair; case-insensitive and
# allowed to span multiple lines.
MISTAKE_PATTERN = re.compile(r"<mistake>(.*?)</mistake>", flags=re.IGNORECASE | re.DOTALL)


def get_intervals(x):
    """Convert tagged text into [start, end] offsets in the untagged text.

    Each matched pair contributes 19 markup characters
    (len("<mistake>") == 9, len("</mistake>") == 10). The k-th match
    (0-based) is preceded by k complete pairs, so its start shifts left by
    19 * k and its end, which also includes its own tags, by 19 * (k + 1).

    Args:
        x: Text containing zero or more <mistake>...</mistake> pairs.

    Returns:
        List of [start, end] lists, one per match, in order of appearance.
        Offsets are only meaningful when every tag in `x` belongs to a
        matched pair.
    """
    tag_overhead = 19
    return [
        [found.span()[0] - tag_overhead * position,
         found.span()[1] - tag_overhead * (position + 1)]
        for position, found in enumerate(MISTAKE_PATTERN.finditer(x))
    ]

def process_result(x, sol):
    """Extract mistake spans from every sampled completion and blend them.

    Works for any number of completions in `x.outputs` rather than exactly 32.

    Args:
        x: One generation result whose `outputs` is a sequence of objects
            with a `.text` attribute (one per sampled completion).
        sol: Original, untagged solution text; used to slice out the text of
            each blended span.

    Returns:
        A tuple `(intervals_all_dict, intervals_all)`:
        - `intervals_all_dict`: list of dicts with keys 'text', 'start' and
          'end' for each span left after folding all completions together
          with `blend_intervals` (left to right, starting from the first).
        - `intervals_all`: per-completion span lists from `get_intervals`,
          in the order of `x.outputs`.
        Both lists are empty when `x.outputs` is empty.
    """
    intervals_all = [get_intervals(output.text) for output in x.outputs]
    if not intervals_all:
        return [], []

    # Fold the completions pairwise, in order: blend(blend(s0, s1), s2), ...
    blended = intervals_all[0]
    for candidate in intervals_all[1:]:
        blended = blend_intervals(blended, candidate)

    intervals_all_dict = [
        {'text': sol[span[0]:span[1]], 'start': span[0], 'end': span[1]}
        for span in blended
    ]
    return intervals_all_dict, intervals_all

In [23]:
from collections import defaultdict
ious = []  # not used by any cell visible in this notebook

# fold_prediction[k][j] is the list of per-completion span lists for test row j
# under fold k's adapter. Iterating over the collected results keeps this cell
# in sync with however many adapters were actually run.
fold_prediction = []
for fold_results in preds_test_i_model:
    spans_predictions = [
        # Element [1] is the raw per-completion spans; the blended spans
        # returned as element [0] are not used here.
        process_result(item, sol['solution'])[1]
        for item, sol in zip(fold_results, test_data_rows)
    ]
    fold_prediction.append(spans_predictions)

In [31]:
# Sanity check: number of per-fold prediction lists collected so far.
len(fold_prediction)

5

In [32]:
# Flatten fold -> sample -> completion into one record per sampled completion.
# Sizes come from the data itself, so the cell keeps working when the number
# of folds, test rows or completions per prompt changes.
df_all_preds = [
    {'fold' : i, 'n' : n, 'sample' : j, 'answer' : answer}
    for i, all_samples in enumerate(fold_prediction)
    for j, all_ns in enumerate(all_samples)
    for n, answer in enumerate(all_ns)
]

In [33]:
# Persist one row per (fold, sample, completion) without the pandas index column.
predictions_frame = pd.DataFrame(df_all_preds)
predictions_frame.to_csv('llm_predictions_private.csv', index=False)