### 1순위 선택지와 2순위 선택지의 확률 격차(Margin, 정답 digit 토큰 로짓의 softmax 기준) 평균 비교

In [1]:
import argparse
import yaml
from pathlib import Path
from typing import Dict, Any
import re
from tqdm import tqdm 
from collections import Counter
from sklearn.model_selection import train_test_split
from typing import List

import pandas as pd
import torch
from transformers import AutoTokenizer

import os
import sys

# Put the directory two levels above the current working directory on sys.path
# so that the `src.*` imports below can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../..')) 
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data.preprocessor import parse_problems_column, add_choices_len
from src.prompt.prompt_builder import PromptBuilder, PromptConfig
from src.training.model_loader import ModelConfig, load_model_inference

In [2]:
def create_configs(cfg_dict: Dict[str, Any]) -> tuple:
    """Build the model, prompt and inference configs from the parsed YAML dict.

    Args:
        cfg_dict: Parsed configuration; reads the ``"model"`` and
            ``"inference"`` sections (including ``inference.prompt.policy``).

    Returns:
        A ``(model_cfg, prompt_cfg, inference_cfg)`` tuple, where
        ``inference_cfg`` is the raw ``"inference"`` section.
    """
    # Work on a shallow copy so forcing gradient checkpointing off does not
    # leak back into cfg_dict["model"].
    model_kwargs = {**cfg_dict["model"], "use_gradient_checkpointing": False}
    model_cfg = ModelConfig(**model_kwargs)

    prompt_cfg = PromptConfig(
        policy=cfg_dict["inference"]["prompt"]["policy"],
        mode="test",
        verbose=False,
    )

    return model_cfg, prompt_cfg, cfg_dict.get("inference", {})

In [3]:
# Read the project configuration. The encoding is pinned to UTF-8 so the file
# is decoded the same way regardless of the platform's default locale.
with open("../../config.yaml", "r", encoding="utf-8") as f:
    cfg_dict = yaml.safe_load(f)

In [4]:
# Derive the model / prompt / inference configs from the loaded YAML.
model_cfg, prompt_cfg, inference_cfg = create_configs(cfg_dict)

### Validation data 재구성

In [5]:
# Load the training CSV and run it through the project preprocessors
# (problem-column parsing, then the choices_len column).
data_path = '../../data/train.csv'
print(f"\nLoading data from {data_path}...")

df = add_choices_len(parse_problems_column(pd.read_csv(data_path)))
print(f"Loaded {len(df)} rows")


Loading data from ../../data/train.csv...
Loaded 2031 rows


In [6]:
# Split settings come from the "data" section of the config.
data_cfg = cfg_dict["data"]
valid_ratio, seed = data_cfg["valid_ratio"], data_cfg["seed"]

print(f"\nSplitting data (valid_ratio={valid_ratio}, seed={seed})...")
# Stratify on the number of choices so both splits keep the same choice-count mix.
train_df, valid_df = train_test_split(
    df,
    test_size=valid_ratio,
    random_state=seed,
    stratify=df["choices_len"],
)
print(f"Train: {len(train_df)} rows")
print(f"Valid: {len(valid_df)} rows")


Splitting data (valid_ratio=0.1, seed=42)...
Train: 1827 rows
Valid: 204 rows


In [7]:
# Load the reviewed rows and keep only those marked keep == True.
lsat_df = pd.read_csv("./review_autosave.csv")
# .copy() detaches the filtered rows from the original frame so that later
# column assignments on lsat_df do not raise SettingWithCopyWarning.
# `== True` is kept on purpose: it also works if the column is not strictly boolean.
lsat_df = lsat_df[lsat_df['keep'] == True].copy()  # noqa: E712

print(f"lsat_df 데이터 {len(lsat_df)}개 준비 완료")

lsat_df 데이터 270개 준비 완료


In [8]:
import ast


def _parse_choices(raw):
    """Turn a stringified list (text starting with '[') into a list; pass anything else through."""
    if isinstance(raw, str) and raw.startswith('['):
        return ast.literal_eval(raw)
    return raw


# Convert the 'choices' column from list-like strings to real list objects.
# Values that are already lists (or otherwise not list-like strings) are left alone.
lsat_df['choices'] = lsat_df['choices'].apply(_parse_choices)

In [9]:
# Number of answer choices per row.
lsat_df['choices_len'] = lsat_df['choices'].map(len)

In [10]:
# Remove the review bookkeeping columns before merging with the validation split.
lsat_df = lsat_df.drop(columns=['group_id', 'keep'])

In [11]:
# Evaluation set = validation split followed by the kept review rows, re-indexed from 0.
eval_df = pd.concat([valid_df, lsat_df], ignore_index=True)

In [12]:
# Row count of the combined evaluation set.
eval_df.shape[0]

474

In [13]:
# Distribution of the number of choices in the evaluation set.
eval_df.loc[:, 'choices_len'].value_counts()

choices_len
5    394
4     80
Name: count, dtype: int64

In [14]:
# Rebuild the prompt config, replacing the one returned by create_configs.
# NOTE(review): this reads cfg_dict["prompt"]["policy"], whereas create_configs
# reads cfg_dict["inference"]["prompt"]["policy"] — confirm which policy is intended.
prompt_policy = cfg_dict["prompt"]["policy"]
prompt_cfg = PromptConfig(policy=prompt_policy, mode="test", verbose=False)

In [15]:
# Prompt builder shared by every generation call below.
builder = PromptBuilder(prompt_cfg)
print("PromptBuilder ready!")


PromptBuilder ready!


In [16]:
def extract_answer(text: str, k: int) -> str:
    """Return the last character of ``text`` in the range '1'..str(k), or "no" if none occurs.

    ``text`` is coerced with ``str()`` first, so non-string inputs are accepted.
    """
    digit_pattern = re.compile(rf'[1-{k}]')
    answer = "no"
    # Walk every match and keep overwriting, so the final value is the last match.
    for match in digit_pattern.finditer(str(text)):
        answer = match.group(0)
    return answer

### 모델 로드

In [17]:
# Prefer the GPU when one is visible to torch; otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}\n")

Device: cuda



In [19]:
# Path of the LoRA adapter that is actually loaded. Previously this variable held
# a different path than the one passed to load_model_inference, so the log line
# named a model that was never loaded.
adapter_path = "../../models/qwen3_14B_eng_aug2/final_model_from_serverA"

print(f"Loading model from {adapter_path}...")
model = load_model_inference(model_cfg, adapter_path)
model.eval()  # inference mode
print("Model loaded successfully!\n")

Loading model from ../../outputs/reading/final_model...
Loading Base Model for Inference: Qwen/Qwen3-14B


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loading LoRA Adapter from: ../../models/qwen3_14B_eng_aug2/final_model_from_serverA
Model loaded successfully!



In [20]:
# Load the tokenizer that belongs to the base model named in the model config.
tokenizer_source = model_cfg.model_name_or_path
print(f"Loading tokenizer from {model_cfg.model_name_or_path}...")
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_source,
    trust_remote_code=model_cfg.trust_remote_code,
)

# generate() is given pad_token_id below, so fall back to EOS when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Loading tokenizer from Qwen/Qwen3-14B...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

## Test 시작

In [21]:
# Upper bound on the number of tokens generated per example.
max_new_tokens = 30

In [22]:
def digit_only_probs_and_margin(step_logits: torch.Tensor, tokenizer, k: int) -> Dict[str, Any]:
    """Softmax the logits of the answer digits ``1..k`` and report the top-1/top-2 gap.

    Only the logits of the tokens for "1".."k" are kept; the softmax is taken over
    those k values alone, so the probabilities sum to 1 across the answer choices.

    Args:
        step_logits: 1-D logits over the vocabulary for a single generation step.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)`` that
            returns a list of token ids.
        k: Number of answer choices.

    Returns:
        A dict with:
            ``digit_probs``  - list of k probabilities (index 0 is choice "1"),
            ``digit_margin`` - top-1 minus top-2 probability (0.0 when k == 1),
            ``digit_top1``   - most likely choice as a 1-indexed string,
            ``digit_top2``   - second most likely choice, or "N/A" when k == 1.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    # When a digit string encodes to more than one token, only its first token is used.
    digit_token_ids = [
        tokenizer.encode(str(digit), add_special_tokens=False)[0]
        for digit in range(1, k + 1)
    ]

    # Gather all digit logits in one indexing op, then compute on CPU in float32.
    index = torch.tensor(digit_token_ids, dtype=torch.long, device=step_logits.device)
    digit_logits = step_logits.detach()[index].float().cpu()
    digit_probs = torch.softmax(digit_logits, dim=-1)

    top_values, top_indices = torch.topk(digit_probs, k=min(2, k))

    digit_top1 = str(top_indices[0].item() + 1)  # 1-indexed
    if k >= 2:
        digit_top2 = str(top_indices[1].item() + 1)
        digit_margin = (top_values[0] - top_values[1]).item()
    else:
        digit_top2 = "N/A"
        digit_margin = 0.0

    return {
        "digit_probs": digit_probs.tolist(),
        "digit_margin": digit_margin,
        "digit_top1": digit_top1,
        "digit_top2": digit_top2,
    }

In [23]:
def generate_for_row_with_retry(
    row_dict: Dict[str, Any],
    builder: PromptBuilder,
    tokenizer: AutoTokenizer,
    model: torch.nn.Module,
    device: str,
    generated_text: str,
    max_new_tokens: int = 30,
) -> Dict[str, Any]:
    """Regenerate an answer after asking the model to reconsider its first attempt.

    Intended for cases where the first prediction had low probability: the first
    output is appended as an assistant turn, followed by a user turn that asks
    the model to think again, and generation is run once more (greedy decoding).

    Args:
        row_dict: One dataset row; reads ``"choices_len"``, ``"answer"`` and ``"id"``,
            and is passed to ``builder.build_message``.
        builder: Prompt builder whose ``build_message`` returns a dict with ``"messages"``.
        tokenizer: Tokenizer used for the chat template, encoding and decoding.
        model: Model exposing ``generate``.
        device: Device the encoded inputs are moved to.
        generated_text: Text produced by the first attempt.
        max_new_tokens: Generation length cap.

    Returns:
        A result dict with the same keys as ``generate_for_row_with_top5`` and
        ``"is_retry": True``; ``"generated_text"`` holds the retry output.
    """
    output = builder.build_message(row_dict)
    # Copy before extending so the list owned by the builder is never mutated.
    messages = list(output["messages"])
    messages.append({"role": "assistant", "content": generated_text})
    messages.append({
        "role": "user",
        "content": "다시 한번 신중하게 생각해서 답변해주세요. 다른 접근 방식으로 다시 풀어보세요.",
    })

    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(
        prompt_text,
        return_tensors="pt",
        truncation=True,
        max_length=4096
    ).to(device)

    k = int(row_dict["choices_len"])
    input_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            return_dict_in_generate=True,
            output_scores=True,
        )

    generated_ids = outputs.sequences[0][input_len:]
    retry_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Locate the step that produced the answer digit: the last generated token
    # that decodes to a digit in 1..k. The previous code always used the
    # second-to-last step, which raised IndexError for one-token outputs and
    # picked the wrong step when the digit was not immediately followed by EOS.
    scores = outputs.scores
    valid_digits = {str(i) for i in range(1, k + 1)}
    n_steps = min(len(scores), len(generated_ids))
    answer_step = next(
        (
            i
            for i in range(n_steps - 1, -1, -1)
            if tokenizer.decode([int(generated_ids[i])]).strip() in valid_digits
        ),
        None,
    )
    if answer_step is None:
        # No digit token found: fall back to the second-to-last step (or the only one).
        answer_step = max(len(scores) - 2, 0)
    step_logits = scores[answer_step][0]

    top5_values, top5_indices = torch.topk(step_logits, k=5)
    probs_full = torch.softmax(step_logits, dim=-1)
    top5_candidates = [
        {
            "rank": rank,
            "token_id": token_id.item(),
            "token": tokenizer.decode([token_id.item()]),
            "logit": logit_val.item(),
            "prob_full_vocab": probs_full[token_id].item(),
        }
        for rank, (logit_val, token_id) in enumerate(zip(top5_values, top5_indices), start=1)
    ]

    digit_info = digit_only_probs_and_margin(step_logits, tokenizer, k)

    predicted_answer = extract_answer(retry_text, k=k)
    gold = str(row_dict["answer"])

    return {
        "id": row_dict["id"],
        "choices_len": k,
        "answer": gold,
        "predicted_answer": predicted_answer,
        "is_correct": predicted_answer == gold,
        "generated_text": retry_text,
        "is_retry": True,  # marks this row as a retry result

        "top5_candidates": top5_candidates,

        "digit_probs_1_to_k": digit_info["digit_probs"],
        "digit_margin_top1_minus_top2": digit_info["digit_margin"],
        "digit_top1": digit_info["digit_top1"],
        "digit_top2": digit_info["digit_top2"],

        "prompt": prompt_text,
    }

In [24]:
def generate_for_row_with_top5(
    row_dict: Dict[str, Any],
    builder: PromptBuilder,
    tokenizer: AutoTokenizer,
    model: torch.nn.Module,
    device: str,
    max_new_tokens: int = 30,
) -> Dict[str, Any]:
    """Generate an answer for one row and collect confidence statistics.

    Runs greedy generation on the prompt built for ``row_dict``, then inspects the
    logits of the step that produced the answer digit.

    Args:
        row_dict: One dataset row; reads ``"choices_len"``, ``"answer"`` and ``"id"``,
            and is passed to ``builder.build_message``.
        builder: Prompt builder whose ``build_message`` returns a dict with ``"messages"``.
        tokenizer: Tokenizer used for the chat template, encoding and decoding.
        model: Model exposing ``generate``.
        device: Device the encoded inputs are moved to.
        max_new_tokens: Generation length cap.

    Returns:
        A dict with the row id, gold and predicted answers, ``is_correct``, the
        generated text, ``"is_retry": False``, the top-5 vocabulary candidates at
        the answer step, the digit-only probabilities / margin / top-1 / top-2,
        and the rendered prompt.
    """
    output = builder.build_message(row_dict)
    messages = output["messages"]

    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(
        prompt_text,
        return_tensors="pt",
        truncation=True,
        max_length=4096
    ).to(device)

    k = int(row_dict["choices_len"])
    input_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            return_dict_in_generate=True,
            output_scores=True,
        )

    generated_ids = outputs.sequences[0][input_len:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Locate the step that produced the answer digit: the last generated token
    # that decodes to a digit in 1..k. The previous code always used the
    # second-to-last step, which raised IndexError for one-token outputs and
    # picked the wrong step when the digit was not immediately followed by EOS.
    scores = outputs.scores
    valid_digits = {str(i) for i in range(1, k + 1)}
    n_steps = min(len(scores), len(generated_ids))
    answer_step = next(
        (
            i
            for i in range(n_steps - 1, -1, -1)
            if tokenizer.decode([int(generated_ids[i])]).strip() in valid_digits
        ),
        None,
    )
    if answer_step is None:
        # No digit token found: fall back to the second-to-last step (or the only one).
        answer_step = max(len(scores) - 2, 0)
    step_logits = scores[answer_step][0]

    top5_values, top5_indices = torch.topk(step_logits, k=5)
    probs_full = torch.softmax(step_logits, dim=-1)
    top5_candidates = [
        {
            "rank": rank,
            "token_id": token_id.item(),
            "token": tokenizer.decode([token_id.item()]),
            "logit": logit_val.item(),
            "prob_full_vocab": probs_full[token_id].item(),
        }
        for rank, (logit_val, token_id) in enumerate(zip(top5_values, top5_indices), start=1)
    ]

    digit_info = digit_only_probs_and_margin(step_logits, tokenizer, k)

    predicted_answer = extract_answer(generated_text, k=k)
    gold = str(row_dict["answer"])

    return {
        "id": row_dict["id"],
        "choices_len": k,
        "answer": gold,
        "predicted_answer": predicted_answer,
        "is_correct": predicted_answer == gold,
        "generated_text": generated_text,
        "is_retry": False,  # marks this row as a first-attempt result

        "top5_candidates": top5_candidates,

        "digit_probs_1_to_k": digit_info["digit_probs"],
        "digit_margin_top1_minus_top2": digit_info["digit_margin"],
        "digit_top1": digit_info["digit_top1"],
        "digit_top2": digit_info["digit_top2"],

        "prompt": prompt_text,
    }


In [25]:
def process_normal(
    df: pd.DataFrame,
    builder: PromptBuilder,
    tokenizer: AutoTokenizer,
    model: torch.nn.Module,
    device: str,
    max_new_tokens: int,
    desc: str = "Processing",
) -> pd.DataFrame:
    """Run ``generate_for_row_with_top5`` on every row of ``df``.

    Rows are processed one at a time, in order, behind a tqdm progress bar
    labelled ``desc``. Returns a DataFrame with one result record per input row.
    """
    records = [
        generate_for_row_with_top5(
            row_dict=row.to_dict(),
            builder=builder,
            tokenizer=tokenizer,
            model=model,
            device=device,
            max_new_tokens=max_new_tokens,
        )
        for _, row in tqdm(df.iterrows(), total=len(df), desc=desc)
    ]
    return pd.DataFrame(records)

In [26]:
# Run generation over the combined evaluation set and report overall accuracy.
banner = "=" * 80
print("\n" + banner)
print("Running inference on VALID set")
print(banner)

valid_gen_df = process_normal(
    eval_df,
    builder,
    tokenizer,
    model,
    device,
    max_new_tokens,
    desc="Valid Generation",
)

valid_acc = valid_gen_df['is_correct'].mean()
print(f"\nValid Accuracy: {valid_acc:.4f} ({valid_gen_df['is_correct'].sum()}/{len(valid_gen_df)})")


Running inference on VALID set


Valid Generation:   0%|          | 0/474 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Valid Generation: 100%|██████████| 474/474 [13:53<00:00,  1.76s/it]


Valid Accuracy: 0.9008 (427/474)





### 결과

In [27]:
# 정답 그룹과 오답 그룹 분리
correct_mask = valid_gen_df['is_correct'] == True
incorrect_mask = valid_gen_df['is_correct'] == False

# 2. 그룹별 digit_margin 평균 계산
avg_margin_correct = valid_gen_df.loc[correct_mask, 'digit_margin_top1_minus_top2'].mean()
avg_margin_incorrect = valid_gen_df.loc[incorrect_mask, 'digit_margin_top1_minus_top2'].mean()

# 3. 결과 출력
print(f"=== 분석 결과 ===")
print(f"맞춘 문제의 Margin 평균: {avg_margin_correct:.4f}")
print(f"틀린 문제의 Margin 평균: {avg_margin_incorrect:.4f}")
print(f"두 그룹 간의 격차: {avg_margin_correct - avg_margin_incorrect:.4f}")

# 4. (선택 사항) 통계적 유의성 확인을 위한 기초 통계량 확인
summary = valid_gen_df.groupby('is_correct')['digit_margin_top1_minus_top2'].describe()
print("\n=== 그룹별 상세 통계 ===")
print(summary)

=== 분석 결과 ===
맞춘 문제의 Margin 평균: 0.9148
틀린 문제의 Margin 평균: 0.5422
두 그룹 간의 격차: 0.3726

=== 그룹별 상세 통계 ===
            count      mean       std       min       25%       50%       75%  \
is_correct                                                                      
False        47.0  0.542171  0.300088  0.079724  0.262475  0.547935  0.830488   
True        427.0  0.914816  0.197331  0.038727  0.954650  0.994378  0.998993   

                 max  
is_correct            
False       0.998627  
True        0.999995  
