In [1]:
import torch
import transformers
from ast import literal_eval
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
import json
import pandas as pd
import random
import numpy as np
import evaluate
from tqdm import tqdm
from peft import LoraConfig
import re

from datasets import load_dataset

pd.set_option('display.max_columns', None)

In [2]:
# 난수 고정
def set_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)

set_seed(42) 

In [3]:
ds4 = load_dataset(
    "yhkimmy/4_choices",
    token="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",
    )
ds5 = load_dataset(
    "yhkimmy/5_choices",
    token="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"
    )

In [4]:
# load model
model_name = "Qwen/Qwen3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
# LoRA config
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=['q_proj', 'k_proj'],
    bias="none",
    task_type="CAUSAL_LM",
)

In [6]:
PROMPT_NO_QUESTION_PLUS = """지문:
{paragraph}

질문:
{question}

선택지:
{choices}

1, 2, 3, 4, 5 중에 하나를 정답으로 고르세요.
정답:"""

PROMPT_QUESTION_PLUS = """지문:
{paragraph}

질문:
{question}

<보기>:
{question_plus}

선택지:
{choices}

1, 2, 3, 4, 5 중에 하나를 정답으로 고르세요.
정답:"""

In [7]:
def make_prompt(dataset):  
    processed_dataset = []
    for i in range(len(dataset)):
        choices_string = "\n".join([f"{idx + 1} - {choice}" for idx, choice in enumerate(dataset[i]["choices"])])

        # <보기>가 있을 때
        if dataset[i]["question_plus"]:
            user_message = PROMPT_QUESTION_PLUS.format(
                paragraph=dataset[i]["paragraph"],
                question=dataset[i]["question"],
                question_plus=dataset[i]["question_plus"],
                choices=choices_string,
            )
        # <보기>가 없을 때
        else:
            user_message = PROMPT_NO_QUESTION_PLUS.format(
                paragraph=dataset[i]["paragraph"],
                question=dataset[i]["question"],
                choices=choices_string,
            )

        processed_dataset.append(
        {
            "id": dataset[i]["id"],
            "messages": [
                {"role": "system", "content": "지문을 읽고 질문의 답을 구하세요."},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": f"{dataset[i]['answer']}"},
            ],
            "label": dataset[i]["answer"],
        }
        )
    return processed_dataset


In [8]:
# The dataset has already been split into train and eval.
train_4choices_with_prompt = make_prompt(ds4['train'])
eval_4choices_with_prompt = make_prompt(ds4['validation']) 

train_5choices_with_prompt = make_prompt(ds5['train'])
eval_5choices_with_prompt = make_prompt(ds5['validation'])

In [9]:
print(len(train_5choices_with_prompt))
print(len(train_4choices_with_prompt))

1115
712


In [10]:
train_4choices_with_prompt = Dataset.from_pandas(pd.DataFrame(train_4choices_with_prompt))
eval_4choices_with_prompt = Dataset.from_pandas(pd.DataFrame(eval_4choices_with_prompt))

train_5choices_with_prompt = Dataset.from_pandas(pd.DataFrame(train_5choices_with_prompt))
eval_5choices_with_prompt = Dataset.from_pandas(pd.DataFrame(eval_5choices_with_prompt))

In [11]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example["messages"])):
        output_texts.append(
            tokenizer.apply_chat_template(
                example["messages"][i],
                tokenize=False,
                add_generation_prompt=False,
                enable_thinking=False, # off
            )
        )
    return output_texts

def tokenize(element):
    outputs = tokenizer(
        formatting_prompts_func(element),
        truncation=False,
        padding=False,
        return_overflowing_tokens=False,
        return_length=False,
    )
    return {
        "input_ids": outputs["input_ids"],
        "attention_mask": outputs["attention_mask"],
    }
    
def tokenized_dataset(dataset):
    return dataset.map(
        tokenize,
        remove_columns=list(dataset.features),
        batched=True,
        num_proc=4,
        load_from_cache_file=True,
        desc="Tokenizing",
    )

In [12]:
# # 데이터 토큰화
train_dataset_with_4choices = tokenized_dataset(train_4choices_with_prompt)
eval_dataset_with_4choices = tokenized_dataset(eval_4choices_with_prompt)

train_dataset_with_5choices = tokenized_dataset(train_5choices_with_prompt)
eval_dataset_with_5choices = tokenized_dataset(eval_5choices_with_prompt)

Tokenizing (num_proc=4):   0%|          | 0/712 [00:00<?, ? examples/s]

Tokenizing (num_proc=4):   0%|          | 0/80 [00:00<?, ? examples/s]

Tokenizing (num_proc=4):   0%|          | 0/1115 [00:00<?, ? examples/s]

Tokenizing (num_proc=4):   0%|          | 0/124 [00:00<?, ? examples/s]

In [13]:
# vram memory 제약으로 인해 인풋 데이터의 길이가 1024 초과인 데이터는 제외하였습니다. *힌트: 1024보다 길이가 더 긴 데이터를 포함하면 더 높은 점수를 달성할 수 있을 것 같습니다!
train_dataset_with_4choices = train_dataset_with_4choices.filter(lambda x: len(x["input_ids"]) <= 1024)  
eval_dataset_with_4choices = eval_dataset_with_4choices.filter(lambda x: len(x["input_ids"]) <= 1024) 
 
train_dataset_with_5choices = train_dataset_with_5choices.filter(lambda x: len(x["input_ids"]) <= 1024)  
eval_dataset_with_5choices = eval_dataset_with_5choices.filter(lambda x: len(x["input_ids"]) <= 1024)


Filter:   0%|          | 0/712 [00:00<?, ? examples/s]

Filter:   0%|          | 0/80 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1115 [00:00<?, ? examples/s]

Filter:   0%|          | 0/124 [00:00<?, ? examples/s]

In [14]:
response_template = "<|im_start|>assistant"
data_collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
)

In [None]:
# metrics 관련 함수들
def _single_token_id(tokenizer, s: str) -> int:
    ids = tokenizer.encode(s, add_special_tokens=False)
    if len(ids) != 1:
        raise ValueError(f"'{s}' is not a single token for this tokenizer: {ids}")
    return ids[0]

def find_digit_token_index(labels_row: torch.Tensor, digit_ids_t: torch.Tensor) -> int:
    digit_ids_t = digit_ids_t.to(labels_row.device)
    valid_pos = labels_row.ne(-100)
    if not valid_pos.any():
        return -1

    matches = (labels_row.unsqueeze(0) == digit_ids_t.unsqueeze(1)).any(dim=0) & valid_pos
    pos = matches.nonzero(as_tuple=False)
    return pos[-1].item() if pos.numel() > 0 else -1

def make_preprocess_logits_for_metrics(tokenizer, choices: int):
    digit_ids = [_single_token_id(tokenizer, str(i)) for i in range(1, choices + 1)]
    digit_ids_t = torch.tensor(digit_ids, dtype=torch.long) 

    def preprocess(logits, labels):
        logits = logits[0] if isinstance(logits, tuple) else logits  # (bs, seq, vocab)
        labels_t = torch.as_tensor(labels)

        bs, seq, _ = logits.shape
        digit_ids_dev = digit_ids_t.to(logits.device)

        gathered_logits = []
        for i in range(bs):
            idx = find_digit_token_index(labels_t[i], digit_ids_dev)
            if idx == -1:
                gathered_logits.append(torch.full((choices,), -1e9, device=logits.device))
            else:
                gathered_logits.append(logits[i, idx].index_select(0, digit_ids_dev))

        return torch.stack(gathered_logits, dim=0)

    return preprocess

def make_compute_metrics(tokenizer, choices: int):
    f1_metric = evaluate.load("f1", average="macro")
    digit_ids = [_single_token_id(tokenizer, str(i)) for i in range(1, choices + 1)]
    digit_ids_t = torch.tensor(digit_ids, dtype=torch.long)
    id2idx = {tid: i for i, tid in enumerate(digit_ids)}

    def compute(eval_pred):
        logits, labels = eval_pred              # logits: (N, choices), labels: (N, seq)
        preds = np.argmax(logits, axis=-1)      # 0-based

        labels_t = torch.as_tensor(labels)
        digit_ids_cpu = digit_ids_t            

        gold = []
        for i in range(labels_t.size(0)):
            idx = find_digit_token_index(labels_t[i], digit_ids_cpu)
            if idx == -1:
                gold.append(-1)
            else:
                tok = int(labels_t[i, idx].item())
                gold.append(id2idx.get(tok, -1))

        gold = np.asarray(gold, dtype=np.int64)
        valid = gold != -1
        if valid.sum() == 0:
            return {"f1": 0.0}

        return f1_metric.compute(predictions=preds[valid], references=gold[valid], average="macro")

    return compute


In [16]:
# pad token 설정
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.special_tokens_map

{'eos_token': '<|im_end|>',
 'pad_token': '<|im_end|>',
 'additional_special_tokens': ['<|im_start|>',
  '<|im_end|>',
  '<|object_ref_start|>',
  '<|object_ref_end|>',
  '<|box_start|>',
  '<|box_end|>',
  '<|quad_start|>',
  '<|quad_end|>',
  '<|vision_start|>',
  '<|vision_end|>',
  '<|vision_pad|>',
  '<|image_pad|>',
  '<|video_pad|>']}

In [None]:
tokenizer.padding_side = 'right'

common_sft_kwargs = dict(
    do_train=True,
    do_eval=True,
    lr_scheduler_type="cosine",
    max_seq_length=1024,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_strategy="epoch",
    eval_strategy="epoch",
    save_total_limit=2,
    save_only_model=True,
    report_to="none",
)

sft_config_4choices = SFTConfig(
    output_dir="outputs_4choices",
    **common_sft_kwargs,
)

sft_config_5choices = SFTConfig(
    output_dir="outputs_5choices",
    **common_sft_kwargs,
)

trainer_for_4choices = SFTTrainer(
    model=model,
    train_dataset=train_dataset_with_4choices,
    eval_dataset=eval_dataset_with_4choices,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=make_compute_metrics(tokenizer, choices=4),
    preprocess_logits_for_metrics=make_preprocess_logits_for_metrics(tokenizer, choices=4),
    peft_config=peft_config,
    args=sft_config_4choices,
)

trainer_for_5choices = SFTTrainer(
    model=model,
    train_dataset=train_dataset_with_5choices,
    eval_dataset=eval_dataset_with_5choices,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=make_compute_metrics(tokenizer, choices=5),
    preprocess_logits_for_metrics=make_preprocess_logits_for_metrics(tokenizer, choices=5),
    peft_config=peft_config,
    args=sft_config_5choices,
)

  super().__init__(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
The model is already on multiple devices. Skipping the move to device specified in `args`.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
The model is already on multiple devices. Skipping the move to device specified in `args`.


In [18]:
%%time

trainer_for_4choices.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151645}.


Epoch,Training Loss,Validation Loss,F1
1,0.1364,0.137292,0.366956
2,0.1559,0.128622,0.348333


KeyboardInterrupt: 

In [None]:
%%time

trainer_for_5choices.train()

Epoch,Training Loss,Validation Loss,F1
1,0.0248,0.032899,0.279027
2,0.0259,0.031973,0.305303
3,0.021,0.03517,0.310823


CPU times: user 19min 1s, sys: 6min 7s, total: 25min 9s
Wall time: 25min 14s


TrainOutput(global_step=2433, training_loss=0.015257072187611811, metrics={'train_runtime': 1514.2849, 'train_samples_per_second': 1.607, 'train_steps_per_second': 1.607, 'total_flos': 8.240335009837056e+16, 'train_loss': 0.015257072187611811, 'epoch': 3.0})

In [None]:
# # Load the test dataset
# # TODO Test Data 경로 입력
test_df = pd.read_csv('../../data/test.csv')

# Flatten the JSON dataset
records = []
for _, row in test_df.iterrows():
    problems = literal_eval(row['problems'])
    record = {
        'id': row['id'],
        'paragraph': row['paragraph'],
        'question': problems['question'],
        'choices': problems['choices'],
        'answer': problems.get('answer', None),
        "question_plus": problems.get('question_plus', None),
    }
    # Include 'question_plus' if it exists
    if 'question_plus' in problems:
        record['question_plus'] = problems['question_plus']
    records.append(record)
        
# Convert to DataFrame
test_df = pd.DataFrame(records)

In [None]:
test_dataset = []
for i, row in test_df.iterrows():
    choices_string = "\n".join([f"{idx + 1} - {choice}" for idx, choice in enumerate(row["choices"])])
    len_choices = len(row["choices"])
    
    # <보기>가 있을 때
    if row["question_plus"]:
        user_message = PROMPT_QUESTION_PLUS.format(
            paragraph=row["paragraph"],
            question=row["question"],
            question_plus=row["question_plus"],
            choices=choices_string,
        )
    # <보기>가 없을 때
    else:
        user_message = PROMPT_NO_QUESTION_PLUS.format(
            paragraph=row["paragraph"],
            question=row["question"],
            choices=choices_string,
        )

    test_dataset.append(
        {
            "id": row["id"],
            "messages": [
                {"role": "system", "content": "지문을 읽고 질문의 답을 구하세요."},
                {"role": "user", "content": user_message},
            ],
            "label": row["answer"],
            "len_choices": len_choices,
        }
    )

In [None]:
from peft import PeftModel

BASE_MODEL = "Qwen/Qwen3-8B"
CKPT4 = "./outputs_4choices/checkpoint-2136"
CKPT5 = "./outputs_5choices/checkpoint-2433"

In [None]:
# base 모델 로드 (GPU 강제)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
pred_choices_map = {0:"1", 1:"2", 2:"3", 3:"4", 4:"5"}

# 4지 LoRA 어댑터 로드
model = PeftModel.from_pretrained(model, CKPT4)
model.eval()

# 4지 숫자 토큰 id
digit_ids_4 = [tokenizer.encode(str(i), add_special_tokens=False)[0] for i in range(1, 5)]

results_4 = []
test_dataset_4 = [ex for ex in test_dataset if int(ex["len_choices"]) == 4]


with torch.inference_mode():
    for ex in tqdm(test_dataset_4):
        _id = ex["id"]
        messages = ex["messages"]

        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            enable_thinking=False,
            return_tensors="pt",
        ).to(model.device)

        out = model(inputs)
        next_logits = out.logits[0, -1]  # (vocab,)

        target_ids = torch.tensor(digit_ids_4, device=next_logits.device)
        target_logits = next_logits.index_select(0, target_ids)

        pred_idx = int(torch.argmax(target_logits).item())
        results_4.append({"id": _id, "answer": pred_choices_map[pred_idx]})


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 163/163 [00:26<00:00,  6.20it/s]


In [None]:
import gc
# for memory clear
del model
gc.collect()
torch.cuda.empty_cache()

In [None]:
# base 모델 다시 로드
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

# 5지 LoRA 어댑터 로드
model = PeftModel.from_pretrained(model, CKPT5)
model.eval()

# 5지 숫자 토큰 id
digit_ids_5 = [tokenizer.encode(str(i), add_special_tokens=False)[0] for i in range(1, 6)]

results_5 = []
test_dataset_5 = [ex for ex in test_dataset if int(ex["len_choices"]) == 5]

with torch.inference_mode():
    for ex in tqdm(test_dataset_5):
        _id = ex["id"]
        messages = ex["messages"]

        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            enable_thinking=False,
            return_tensors="pt",
        ).to(model.device)

        out = model(inputs)
        next_logits = out.logits[0, -1]

        target_ids = torch.tensor(digit_ids_5, device=next_logits.device)
        target_logits = next_logits.index_select(0, target_ids)

        pred_idx = int(torch.argmax(target_logits).item())
        results_5.append({"id": _id, "answer": pred_choices_map[pred_idx]})


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 706/706 [04:35<00:00,  2.57it/s]


In [None]:
# 4지선다, 5지선다 결과 합치기
submission = results_4 + results_5

with open("./output/submission.json", "w", encoding="utf-8") as f:
    json.dump(submission, f, ensure_ascii=False, indent=2)

print("Saved submission.json with", len(submission), "samples")


Saved submission.json with 869 samples
