In [None]:
import argparse
import yaml
from pathlib import Path
from typing import Dict, Any
import re
from tqdm import tqdm 

import pandas as pd
import torch
from transformers import AutoTokenizer

import sys
import os

# Resolve the directory two levels above the current working directory and
# make sure it is on sys.path, so the `src.*` project imports that follow
# can be resolved. (Fixes the misspelled `os.path.abpath`, which raised
# AttributeError before any project import could run.)
project_root = os.path.abspath(os.path.join(os.getcwd(), '../..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data.preprocessor import parse_problems_column, add_choices_len
from src.prompt.prompt_builder import PromptBuilder, PromptConfig
from src.training.model_loader import ModelConfig, load_model_inference

In [None]:
def create_configs(cfg_dict: Dict[str, Any]) -> tuple:
    """Build the model, prompt, and inference settings from a parsed config.

    Args:
        cfg_dict: Parsed configuration mapping. It must contain a "model"
            entry and an "inference" entry whose "prompt" mapping has a
            "policy" key; a missing key raises ``KeyError``.

    Returns:
        A ``(model_cfg, prompt_cfg, inference_cfg)`` tuple, where
        ``model_cfg`` is a ``ModelConfig`` built from the "model" section
        with ``use_gradient_checkpointing`` forced to ``False``,
        ``prompt_cfg`` is a ``PromptConfig`` in "test" mode with verbose
        output off, and ``inference_cfg`` is the "inference" section itself.
    """
    # Shallow-copy the section so the override below does not touch the
    # caller's mapping.
    model_kwargs = cfg_dict["model"].copy()
    model_kwargs["use_gradient_checkpointing"] = False
    model_cfg = ModelConfig(**model_kwargs)

    policy = cfg_dict["inference"]["prompt"]["policy"]
    prompt_cfg = PromptConfig(policy=policy, mode="test", verbose=False)

    return model_cfg, prompt_cfg, cfg_dict.get("inference", {})

In [None]:
# Parse the YAML configuration located two directories above the working
# directory.
with open("../../config.yaml", "r") as config_file:
    cfg_dict = yaml.safe_load(config_file)

model_cfg, prompt_cfg, inference_cfg = create_configs(cfg_dict)

# Required inference settings: a missing key raises KeyError.
adapter_path = inference_cfg["adapter_path"]
test_data_path = inference_cfg["test_data_path"]
output_path = inference_cfg["output_path"]
output_logits_path = inference_cfg["output_logits_path"]

# Optional setting: generation length falls back to 100 new tokens.
max_new_tokens = inference_cfg.get("max_new_tokens", 100)

In [None]:
def load_test_data(test_path: Path) -> pd.DataFrame:
    """Read the test CSV and pass it through the project preprocessing helpers.

    Args:
        test_path: Location of the test CSV file.

    Returns:
        The result of ``add_choices_len`` applied to the output of
        ``parse_problems_column`` on the raw CSV contents.
    """
    raw_df = pd.read_csv(test_path)
    return add_choices_len(parse_problems_column(raw_df))

In [None]:
# Use CUDA when PyTorch reports it as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}\n")

# Read and preprocess the test set, then report how many rows were loaded.
print(f"Loading test data from {test_data_path}...")
test_df = load_test_data(test_data_path)
print(f"Loaded {len(test_df)} rows\n")

In [None]:
# Keep only the first 5 rows of the test set.
# NOTE(review): presumably a quick-iteration limit; confirm and remove before a full run.
test_df = test_df.head(5)

In [None]:
# Load the tokenizer that matches the configured base model.
tokenizer_source = model_cfg.model_name_or_path
print(f"Loading tokenizer from {tokenizer_source}...")
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_source,
    trust_remote_code=model_cfg.trust_remote_code,
)

# When the tokenizer defines no padding token, reuse the end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the model through the project helper, passing the model config and the
# adapter path read from the inference settings.
print(f"Loading model from {adapter_path}...")
model = load_model_inference(model_cfg, adapter_path)
# Put the model in evaluation mode before generating.
# NOTE(review): assumes the returned object follows the torch.nn.Module API — confirm.
model.eval()
print("Model loaded successfully!\n")

In [None]:
# Create the prompt builder from the prompt config returned by create_configs
# (mode="test", verbose=False).
builder = PromptBuilder(prompt_cfg)
print("PromptBuilder ready!\n")

config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/130 [00:00<?, ?B/s]

In [None]:
# Take the first test row as a flat {column: value} mapping.
# A one-row slice (`test_df[:1].to_dict()`) would instead produce a nested
# {column: {index: value}} mapping, on which `int(...)` below raises TypeError.
row_dict = test_df.iloc[0].to_dict()

# Number of answer choices for this row.
# NOTE(review): the "choices_len" column is presumably added by add_choices_len — confirm.
k = int(row_dict["choices_len"])

# Build the chat messages for this row with the project prompt builder.
output = builder.build_message(row_dict)
messages = output["messages"]

# Render the messages into one prompt string, ending with the generation
# prompt so the model continues as the assistant.
# NOTE(review): `enable_thinking` is passed through to the chat template;
# assumes the model's template understands it — confirm.
prompt_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)

# Tokenize the prompt (truncated to at most 4096 tokens) and move the
# tensors to the selected device.
inputs = tokenizer(
    prompt_text,
    return_tensors="pt",
    truncation=True,
    max_length=4096
).to(device)

# Generate without sampling and without tracking gradients. The structured
# return value also carries the per-step scores.
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        return_dict_in_generate=True,
        output_scores=True,
    )