In [3]:
import os
import sys
from pathlib import Path

nb_dir = Path(os.getcwd())

project_root = nb_dir.parents[1]

if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print("project_root:", project_root)
print("sys.path[0]:", sys.path[0])

project_root: /data/ephemeral/pro-nlp-generationfornlp-nlp-13
sys.path[0]: /data/ephemeral/pro-nlp-generationfornlp-nlp-13


In [4]:
import yaml
import pandas as pd

from transformers import AutoTokenizer


from src.train import load_config, create_configs
from src.data.data_loader import make_train_valid_dataset, DataConfig
from src.data.preprocessor import parse_problems_column, add_choices_len
from src.prompt.prompt_builder import PromptConfig
from src.data.tokenizer_wrapper import TokenizerConfig
from src.inference import create_configs as inference_create_configs
from src.inference import load_test_data, process_row, extract_answer


project_root = Path("/data/ephemeral/pro-nlp-generationfornlp-nlp-13")

with open(project_root / "config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

data_cfg = DataConfig(
    train_path=str(project_root / "data/train.csv"),
    test_path=str(project_root / "data/test.csv"),
    valid_ratio=cfg["data"]["valid_ratio"],  # 0.1
    seed=cfg["data"]["seed"],  # 42
    do_split=cfg["data"]["do_split"],  # True
)

prompt_cfg = PromptConfig(
    templates_dir=str(project_root / "src/prompt/templates"),
    verbose=cfg["prompt"]["verbose"],
    policy=cfg["prompt"]["policy"],
)

tokenize_cfg_train = TokenizerConfig(
    max_length=cfg["tokenizer"]["train"]["max_length"],  # 2048
    padding=cfg["tokenizer"]["train"]["padding"],  # False
    truncation=cfg["tokenizer"]["train"]["truncation"],  # True
    add_generation_prompt=cfg["tokenizer"]["train"]["add_generation_prompt"],  # False
)

tokenize_cfg_gen = TokenizerConfig(
    max_length=cfg["tokenizer"]["gen"]["max_length"],  # 2048
    padding=cfg["tokenizer"]["gen"]["padding"],  # "max_length"
    truncation=cfg["tokenizer"]["gen"]["truncation"],  # True
    add_generation_prompt=cfg["tokenizer"]["gen"]["add_generation_prompt"],  # True
)

model_name = cfg["model"]["model_name_or_path"]  # "Qwen/Qwen3-14B"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=cfg["model"]["trust_remote_code"],
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

ds = make_train_valid_dataset(
    data_cfg=data_cfg,
    prompt_cfg=prompt_cfg,
    tokenize_cfg_train=tokenize_cfg_train,
    tokenize_cfg_gen=tokenize_cfg_gen,
    tokenizer=tokenizer,
)

# Dataset
df = pd.read_csv(project_root / "data/train.csv")
df = parse_problems_column(df)
df = add_choices_len(df)
df.info()

# inference 수행


Build train messages:   0%|          | 0/1827 [00:00<?, ? examples/s]

Serialize train to text:   0%|          | 0/1827 [00:00<?, ? examples/s]

Build valid messages (teacher forcing):   0%|          | 0/204 [00:00<?, ? examples/s]

Serialize valid to text:   0%|          | 0/204 [00:00<?, ? examples/s]

Build valid_gen messages (prompt only):   0%|          | 0/204 [00:00<?, ? examples/s]

Serialize valid_gen to text (+meta):   0%|          | 0/204 [00:00<?, ? examples/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2031 entries, 0 to 2030
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             2031 non-null   object 
 1   paragraph      2031 non-null   object 
 2   question_plus  0 non-null      float64
 3   question       2031 non-null   object 
 4   choices        2031 non-null   object 
 5   answer         2031 non-null   int64  
 6   choices_len    2031 non-null   int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 111.2+ KB


In [7]:
from sklearn.model_selection import train_test_split

train_df, valid_df = train_test_split(
    df,
    test_size=data_cfg.valid_ratio,  # config.yaml에서 설정된 값 (기본 0.1)
    stratify=df["choices_len"],      # choices_len으로 stratify
    random_state=data_cfg.seed,       # config.yaml에서 설정된 seed (기본 42)
)

print(f"Original df: {len(df)}")
print(f"Train df: {len(train_df)}")
print(f"Valid df: {len(valid_df)}")
print(f"Valid ratio: {len(valid_df)/len(df):.4f}")

print("\nTrain choices_len distribution:")
print(train_df['choices_len'].value_counts().sort_index())
print("\nValid choices_len distribution:")
print(valid_df['choices_len'].value_counts().sort_index())

print("\n=== Comparison with actual training data ===")
print(f"ds['train'] size: {len(ds['train'])}")
print(f"ds['validation'] size: {len(ds['validation'])}")
print(f"ds['validation_gen'] size: {len(ds['validation_gen'])}")

print(f"\nSplit matches: {len(train_df) == len(ds['train']) and len(valid_df) == len(ds['validation'])}")

Original df: 2031
Train df: 1827
Valid df: 204
Valid ratio: 0.1004

Train choices_len distribution:
choices_len
4     712
5    1115
Name: count, dtype: int64

Valid choices_len distribution:
choices_len
4     80
5    124
Name: count, dtype: int64

=== Comparison with actual training data ===
ds['train'] size: 1827
ds['validation'] size: 204
ds['validation_gen'] size: 204

Split matches: True


In [None]:
from sklearn.model_selection import train_test_split

train_df, valid_df = train_test_split(
    df,
    test_size=data_cfg.valid_ratio,
    stratify=df["choices_len"],
    random_state=data_cfg.seed,
)

print(f"Train: {len(train_df)}, Valid: {len(valid_df)}")

# ===== Model Config 불러오기 =====
from src.training.model_loader import ModelConfig

model_cfg = ModelConfig(
    model_name_or_path=cfg["model"]["model_name_or_path"],
    use_4bit=cfg["model"]["use_4bit"],
    bnb_4bit_quant_type=cfg["model"]["bnb_4bit_quant_type"],
    bnb_4bit_use_double_quant=cfg["model"]["bnb_4bit_use_double_quant"],
    compute_dtype=cfg["model"]["compute_dtype"],
    device_map=cfg["model"]["device_map"],
    use_gradient_checkpointing=cfg["model"]["use_gradient_checkpointing"],
    trust_remote_code=cfg["model"]["trust_remote_code"],
)

print(f"\n=== Model Config ===")
print(f"Model: {model_cfg.model_name_or_path}")
print(f"4-bit: {model_cfg.use_4bit}")
print(f"Gradient Checkpointing: {model_cfg.use_gradient_checkpointing}")
# ===================================

import torch
from transformers import AutoTokenizer
from src.training.model_loader import load_model_inference

adapter_path = project_root / "models/qwen3_14B_eng/final_model" 
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"\nDevice: {device}")
print(f"Loading model from {adapter_path}...")

tokenizer = AutoTokenizer.from_pretrained(
    model_cfg.model_name_or_path,
    trust_remote_code=model_cfg.trust_remote_code,
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = load_model_inference(model_cfg, adapter_path)
model.eval()
print("Model loaded!")



from src.prompt.prompt_builder import PromptBuilder, PromptConfig

test_prompt_cfg = PromptConfig(
    policy=prompt_cfg.policy,
    mode="test",
    verbose=False
)
builder = PromptBuilder(test_prompt_cfg)

import re
from tqdm import tqdm

def extract_answer(text: str, k: int) -> str:
    """텍스트에서 1~k 범위의 답변 추출"""
    numbers = re.findall(rf'[1-{k}]', str(text))
    return numbers[-1] if numbers else "no"

def generate_for_row_with_top5(row_dict, max_new_tokens=30):
    """한 row에 대해 generation 수행 + 마지막 토큰의 top-5 후보 추출"""
    output = builder.build_message(row_dict)
    messages = output["messages"]
    
    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    
    inputs = tokenizer(
        prompt_text,
        return_tensors="pt",
        truncation=True,
        max_length=4096
    ).to(device)
    
    k = int(row_dict["choices_len"])
    input_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            return_dict_in_generate=True,
            output_scores=True,
        )
    
    generated_ids = outputs.sequences[0][input_len:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
    
    last_step_logits = outputs.scores[-2][0]
    
    top5_values, top5_indices = torch.topk(last_step_logits, k=5)
    
    top5_candidates = []
    probs = torch.softmax(last_step_logits, dim=-1)
    
    for rank, (logit_val, token_id) in enumerate(zip(top5_values, top5_indices)):
        token_text = tokenizer.decode([token_id.item()])
        prob_val = probs[token_id].item()
        
        top5_candidates.append({
            "rank": rank + 1,
            "token_id": token_id.item(),
            "token": token_text,
            "logit": logit_val.item(),
            "prob": prob_val,
        })
    
    last_token_id = generated_ids[-1].item()
    last_token = tokenizer.decode([last_token_id])
    last_token_logit = last_step_logits[last_token_id].item()
    last_token_prob = probs[last_token_id].item()
    
    predicted_answer = extract_answer(generated_text, k=k)
    gold = str(row_dict["answer"])
    
    return {
        "id": row_dict["id"],
        "choices_len": k,
        "true_answer": gold,
        "predicted_answer": predicted_answer,
        "generated_text": generated_text,
        "is_correct": predicted_answer == gold,
        
        "last_token": last_token,
        "last_token_id": last_token_id,
        "last_token_logit": last_token_logit,
        "last_token_prob": last_token_prob,
        
        "top5_candidates": top5_candidates,
        
        "prompt": prompt_text,
    }


In [65]:
train_gen_df['is_correct'].value_counts()

is_correct
True     1758
False      69
Name: count, dtype: int64

In [None]:

print("=" * 60)
print("Running Generation on TRAIN set")
print("=" * 60)

train_results = []
for idx, row in tqdm(train_df.iterrows(), total=len(train_df), desc="Train Generation"):
    row_dict = row.to_dict()
    result = generate_for_row_with_top5(row_dict, max_new_tokens=raw_cfg["inference"]["max_new_tokens"])
    train_results.append(result)

train_gen_df = pd.DataFrame(train_results)


train_acc = train_gen_df['is_correct'].mean()
print(f"\nTrain Accuracy: {train_acc:.4f} ({train_gen_df['is_correct'].sum()}/{len(train_gen_df)})")


In [None]:
print("\n" + "=" * 60)
print("Running Generation on VALID set")
print("=" * 60)

valid_results = []
for idx, row in tqdm(valid_df.iterrows(), total=len(valid_df), desc="Valid Generation"):
    row_dict = row.to_dict()
    result = generate_for_row_with_top5(row_dict, max_new_tokens=raw_cfg["inference"]["max_new_tokens"])
    valid_results.append(result)

valid_gen_df = pd.DataFrame(valid_results)

# Valid 정확도
valid_acc = valid_gen_df['is_correct'].mean()
print(f"\nValid Accuracy: {valid_acc:.4f} ({valid_gen_df['is_correct'].sum()}/{len(valid_gen_df)})")

print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Valid Accuracy: {valid_acc:.4f}")
print(f"Generalization Gap: {(train_acc - valid_acc):.4f}")


Running Generation on VALID set


Valid Generation: 100%|██████████| 204/204 [05:35<00:00,  1.64s/it]


Valid Accuracy: 0.9020 (184/204)

SUMMARY
Train Accuracy: 0.9622
Valid Accuracy: 0.9020
Generalization Gap: 0.0603





In [75]:
valid_gen_df

Unnamed: 0,id,choices_len,true_answer,predicted_answer,generated_text,is_correct,last_token,last_token_id,last_token_logit,last_token_prob,top5_candidates,prompt
0,generation-for-nlp-1672,5,1,1,<think>\n\n</think>\n\n1,True,<|im_end|>,151645,7.050781,1.681572e-11,"[{'rank': 1, 'token_id': 16, 'token': '1', 'lo...",<|im_start|>system\n당신은 논리적인 **텍스트 분석 및 독해 전문가...
1,generation-for-nlp-2056,5,1,1,<think>\n\n</think>\n\n1,True,<|im_end|>,151645,6.699219,2.464974e-11,"[{'rank': 1, 'token_id': 16, 'token': '1', 'lo...",<|im_start|>system\n당신은 논리적인 **텍스트 분석 및 독해 전문가...
2,generation-for-nlp-1033,4,2,2,<think>\n\n</think>\n\n2,True,<|im_end|>,151645,4.714844,1.496562e-10,"[{'rank': 1, 'token_id': 17, 'token': '2', 'lo...",<|im_start|>system\nYou are a student solving ...
3,generation-for-nlp-1762,5,2,2,<think>\n\n</think>\n\n2,True,<|im_end|>,151645,6.496094,4.000352e-11,"[{'rank': 1, 'token_id': 17, 'token': '2', 'lo...",<|im_start|>system\n당신은 논리적인 **텍스트 분석 및 독해 전문가...
4,generation-for-nlp-1324,4,3,3,<think>\n\n</think>\n\n3,True,<|im_end|>,151645,5.332031,4.335726e-10,"[{'rank': 1, 'token_id': 18, 'token': '3', 'lo...",<|im_start|>system\nYou are a student solving ...
...,...,...,...,...,...,...,...,...,...,...,...,...
199,generation-for-nlp-1529,5,1,1,<think>\n\n</think>\n\n1,True,<|im_end|>,151645,6.003906,8.456181e-12,"[{'rank': 1, 'token_id': 16, 'token': '1', 'lo...",<|im_start|>system\n당신은 논리적인 **텍스트 분석 및 독해 전문가...
200,generation-for-nlp-662,4,4,4,<think>\n\n</think>\n\n4,True,<|im_end|>,151645,5.402344,3.656703e-10,"[{'rank': 1, 'token_id': 19, 'token': '4', 'lo...",<|im_start|>system\nYou are a student solving ...
201,generation-for-nlp-865,4,1,1,<think>\n\n</think>\n\n1,True,<|im_end|>,151645,5.269531,2.857477e-10,"[{'rank': 1, 'token_id': 16, 'token': '1', 'lo...",<|im_start|>system\nYou are a student solving ...
202,generation-for-nlp-966,4,4,2,<think>\n\n</think>\n\n2,False,<|im_end|>,151645,4.125000,7.920473e-11,"[{'rank': 1, 'token_id': 17, 'token': '2', 'lo...",<|im_start|>system\nYou are a student solving ...


In [73]:
train_gen_df.iloc[0,:].to_dict()

{'id': 'generation-for-nlp-2661',
 'choices_len': 5,
 'true_answer': '3',
 'predicted_answer': '3',
 'generated_text': '<think>\n\n</think>\n\n3',
 'is_correct': True,
 'last_token': '<|im_end|>',
 'last_token_id': 151645,
 'last_token_logit': 5.18359375,
 'last_token_prob': 3.1065129635354793e-11,
 'top5_candidates': [{'rank': 1,
   'token_id': 18,
   'token': '3',
   'logit': 29.375,
   'prob': 0.9964773058891296},
  {'rank': 2,
   'token_id': 17,
   'token': '2',
   'logit': 23.125,
   'prob': 0.0019236537627875805},
  {'rank': 3,
   'token_id': 20,
   'token': '5',
   'logit': 22.234375,
   'prob': 0.0007894659065641463},
  {'rank': 4,
   'token_id': 19,
   'token': '4',
   'logit': 21.9375,
   'prob': 0.0005866812425665557},
  {'rank': 5,
   'token_id': 16,
   'token': '1',
   'logit': 20.921875,
   'prob': 0.00021248187113087624}],
 'prompt': '<|im_start|>system\n당신은 논리적인 **텍스트 분석 및 독해 전문가**입니다.\n이 문제는 오직 **제공된 지문 내의 정보**만으로 풀어야 합니다.\n당신의 외부 배경지식을 배제하고, 철저하게 지문에 명시된 내용에 근거하여 판단하십

In [66]:
# 틀린 문제들만 하나씩 출력
for idx, row in train_gen_df[~train_gen_df['is_correct']].iterrows():
    print("=" * 80)
    print(f"ID: {row['id']}")
    print(f"Choices Length: {row['choices_len']}")
    print(f"True Answer: {row['true_answer']}")
    print(f"Predicted Answer: {row['predicted_answer']}")
    print(f"\n[Generated Text]")
    print(row['generated_text'])
    print(f"\n[Last Token Info]")
    print(f"  Token: {row['last_token']}")
    print(f"  Prob: {row['last_token_prob']:.4f}")
    print(f"\n[Top 5 Candidates]")
    for cand in row['top5_candidates']:
        print(f"  Rank {cand['rank']}: {cand['token']} (prob: {cand['prob']:.4f})")
    print(f"\n[Prompt]")
    print(row['prompt'])
    print("=" * 80)
    print("\n")


ID: generation-for-nlp-770
Choices Length: 4
True Answer: 2
Predicted Answer: 4

[Generated Text]
<think>

</think>

4

[Last Token Info]
  Token: <|im_end|>
  Prob: 0.0000

[Top 5 Candidates]
  Rank 1: 4 (prob: 0.3446)
  Rank 2: 1 (prob: 0.2948)
  Rank 3: 3 (prob: 0.1816)
  Rank 4: 2 (prob: 0.1788)
  Rank 5: 5 (prob: 0.0001)

[Prompt]
<|im_start|>system
You are a student solving multiple-choice questions. The problem consists of a passage, a question, and choices. Solve the problem step-by-step according to the guidelines below.

Guidelines:
1. Question Analysis: Define exactly what the question is asking for.
2. Choice Analysis: Analyze the choices and expand upon the concepts. For example, expand "World War II" to a broader concept like "War." If a choice does not appear in the passage, analyze it by including the expanded concept.
3. Background Knowledge Check: If the problem requires background knowledge, clearly identify what knowledge and specific parts are needed, and check whe

In [64]:
# 틀린 문제들만 하나씩 출력
for idx, row in valid_gen_df[~valid_gen_df['is_correct']].iterrows():
    print("=" * 80)
    print(f"ID: {row['id']}")
    print(f"Choices Length: {row['choices_len']}")
    print(f"True Answer: {row['true_answer']}")
    print(f"Predicted Answer: {row['predicted_answer']}")
    print(f"\n[Generated Text]")
    print(row['generated_text'])
    print(f"\n[Last Token Info]")
    print(f"  Token: {row['last_token']}")
    print(f"  Prob: {row['last_token_prob']:.4f}")
    print(f"\n[Top 5 Candidates]")
    for cand in row['top5_candidates']:
        print(f"  Rank {cand['rank']}: {cand['token']} (prob: {cand['prob']:.4f})")
    print(f"\n[Prompt]")
    print(row['prompt'])
    print("=" * 80)
    print("\n")


ID: generation-for-nlp-2392
Choices Length: 5
True Answer: 1
Predicted Answer: 2

[Generated Text]
<think>

</think>

2

[Last Token Info]
  Token: <|im_end|>
  Prob: 0.0000

[Top 5 Candidates]
  Rank 1: 2 (prob: 0.4954)
  Rank 2: 1 (prob: 0.4802)
  Rank 3: 3 (prob: 0.0147)
  Rank 4: 4 (prob: 0.0072)
  Rank 5: 5 (prob: 0.0024)

[Prompt]
<|im_start|>system
당신은 논리적인 **텍스트 분석 및 독해 전문가**입니다.
이 문제는 오직 **제공된 지문 내의 정보**만으로 풀어야 합니다.
당신의 외부 배경지식을 배제하고, 철저하게 지문에 명시된 내용에 근거하여 판단하십시오.<|im_end|>
<|im_start|>user
### 지문
대법관추천위원회가 다음달 임기를 마치는 민일영 대법관 후임으로 강형주 법원행정처 차장(56·사법연수원 13기), 성낙송 수원지방법원장(57·14기), 이기택 서울서부지방법원장(56·14기) 등 3명을 추천했다. 양승태 대법원장은 이들 중 한 명을 선정해 이르면 이번주에 대통령에게 임명을 제청할 예정이다. 대법원 구성 다양화를 요구해 온 재야 법조계는 “이번에도 ‘50대, 남성, 법관 출신’이라는 획일적인 틀을 벗어나지 못했다”고 비판했다.추천위는 4일 서울 서초동 대법원 청사에서 회의를 열어 법원 안팎에서 천거된 대상자들을 최종 심사한 뒤 이같이 결정했다. 추천받은 후보자들은 모두 서울대 법대 출신의 현직 법관으로 채워졌다.후보자들이 주력해온 분야와 업적 등은 조금씩 다르다. 강 차장은 전남 함평에서 태어나 광주제일고를 졸업했다. 서울중앙지법 근무 당시 영장전담과 형사합의부 재판장 등을 지낸 대표적인 형사 전문가다. 서울고법 근무 때는 민청학련 사건에 연루됐던 

In [None]:
valid_gen_df.iloc[0,:].to_dict()

{'id': 'generation-for-nlp-1672',
 'choices_len': 5,
 'true_answer': '1',
 'predicted_answer': '1',
 'generated_text': '<think>\n\n</think>\n\n1',
 'is_correct': True,
 'last_token': '<|im_end|>',
 'last_token_id': 151645,
 'last_token_logit': 7.05078125,
 'last_token_prob': 1.6815720690899205e-11,
 'top5_candidates': [{'rank': 1,
   'token_id': 16,
   'token': '1',
   'logit': 31.859375,
   'prob': 0.9998868703842163},
  {'rank': 2,
   'token_id': 17,
   'token': '2',
   'logit': 22.5,
   'prob': 8.614418766228482e-05},
  {'rank': 3,
   'token_id': 18,
   'token': '3',
   'logit': 20.515625,
   'prob': 1.1841939340229146e-05},
  {'rank': 4,
   'token_id': 15,
   'token': '0',
   'logit': 19.703125,
   'prob': 5.254828920442378e-06},
  {'rank': 5,
   'token_id': 19,
   'token': '4',
   'logit': 19.6875,
   'prob': 5.173360023036366e-06}],
 'prompt': '<|im_start|>system\n당신은 논리적인 **텍스트 분석 및 독해 전문가**입니다.\n이 문제는 오직 **제공된 지문 내의 정보**만으로 풀어야 합니다.\n당신의 외부 배경지식을 배제하고, 철저하게 지문에 명시된 내용에 근거하여 판단하