In [None]:
import torch
import transformers
from ast import literal_eval
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import Dataset
import json

import pandas as pd
import random
import numpy as np
import evaluate
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from peft import AutoPeftModelForCausalLM, LoraConfig

# Lift pandas' column display limit so wide DataFrames are shown without elided columns.
pd.set_option('display.max_columns', None)

In [None]:
# Load the pre-quantized 4-bit Llama-3.1-8B-Instruct checkpoint and its tokenizer via Unsloth.
# NOTE(review): unsloth is imported here, after transformers/trl/peft were already imported in
# the first cell; Unsloth's documentation asks for unsloth to be imported first — confirm order.
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: no visible cell reads `fourbit_models`.
fourbit_models = [
    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit" # NEW! Llama 3.3 70B!
] # More models at https://huggingface.co/unsloth

# Returns the model and its tokenizer; both names are reused by the following cells.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
from unsloth.chat_templates import get_chat_template
from unsloth.chat_templates import train_on_responses_only
# Rebind `tokenizer` to the object returned by get_chat_template for the "llama-3.1" template,
# so later apply_chat_template calls use that template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

# Bare expression: shows the tokenizer repr as the cell output.
tokenizer

In [None]:
# Fix all random seeds for reproducibility
def set_seed(random_seed):
    """Seed PyTorch (CPU and CUDA), NumPy and Python's `random` with the same value.

    Also switches cuDNN to deterministic mode and turns its benchmark autotuner off.

    Args:
        random_seed: Integer seed handed to every generator.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
        np.random.seed,
        random.seed,
    )
    for seed_fn in seeders:
        seed_fn(random_seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(42) # magic number :)

In [None]:
# Read the training CSV.
# TODO: set the path to the training data
dataset = pd.read_csv('./data/train.csv')

# Each row stores its question data as a Python-literal string in the 'problems' column;
# parse it and flatten it into one flat record per row.
records = []
for _, row in dataset.iterrows():
    problems = literal_eval(row['problems'])
    record = dict(
        id=row['id'],
        paragraph=row['paragraph'],
        question=problems['question'],
        choices=problems['choices'],
        # Optional keys: stored as None when the parsed dict does not carry them.
        answer=problems.get('answer'),
        question_plus=problems.get('question_plus'),
    )
    records.append(record)

# One DataFrame row per flattened record.
df = pd.DataFrame(records)

In [None]:
# Wrap the loaded model with LoRA adapters; `model` is rebound to the PEFT-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 256,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    # Adapters are attached to the attention projections and to the MLP projections.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    # target_modules = [ "o_proj",
    #                   "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 256,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 42,    # same value as the global seed set via set_seed(42)
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

In [None]:
# Wrap the flattened DataFrame in a Hugging Face Dataset.
# Note: `dataset` previously held the raw pandas DataFrame read from the CSV; it is rebound here.
dataset = Dataset.from_pandas(df)

In [None]:
# User-prompt template for questions WITHOUT an extra <보기> block.
# Placeholders filled with str.format: {paragraph}, {question}, {choices}.
# NOTE(review): the instruction line always lists options 1-5, while the sample record shown
# later in this notebook has only four choices — confirm this is intended.
PROMPT_NO_QUESTION_PLUS = """지문:
{paragraph}

질문:
{question}

선택지:
{choices}

1, 2, 3, 4, 5 중에 하나를 정답으로 고르세요.
정답:"""

# User-prompt template for questions WITH an extra <보기> block.
# Same placeholders as above plus {question_plus}, inserted between the question and the choices.
PROMPT_QUESTION_PLUS = """지문:
{paragraph}

질문:
{question}

<보기>:
{question_plus}

선택지:
{choices}

1, 2, 3, 4, 5 중에 하나를 정답으로 고르세요.
정답:"""

In [None]:
# Build one chat-formatted record (system / user / assistant messages) per dataset row.
processed_dataset = []
for i in range(len(dataset)):
    # Fetch the row once. Every `dataset[i]` lookup materializes the whole row again, and the
    # previous version repeated that lookup for each individual field it read.
    example = dataset[i]

    # Number the choices starting at 1, one per line: "1 - ...", "2 - ...", ...
    choices_string = "\n".join(
        f"{idx + 1} - {choice}" for idx, choice in enumerate(example["choices"])
    )

    # When an extra <보기> block is present
    if example["question_plus"]:
        user_message = PROMPT_QUESTION_PLUS.format(
            paragraph=example["paragraph"],
            question=example["question"],
            question_plus=example["question_plus"],
            choices=choices_string,
        )
    # When there is no <보기> block
    else:
        user_message = PROMPT_NO_QUESTION_PLUS.format(
            paragraph=example["paragraph"],
            question=example["question"],
            choices=choices_string,
        )

    # Convert to the chat-message format; the assistant turn is the answer rendered as text.
    processed_dataset.append(
        {
            "id": example["id"],
            "messages": [
                {"role": "system", "content": "지문을 읽고 질문의 답을 구하세요."},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": f"{example['answer']}"}
            ],
            "label": example["answer"],
        }
    )


In [None]:
# Sanity check: display the first chat-formatted record.
processed_dataset[0]

In [None]:
# Convert the list of chat records into a Hugging Face Dataset (going through a DataFrame).
# NOTE(review): this rebinds `processed_dataset` from a list to a Dataset, so re-running this
# cell alone feeds a Dataset (not the list) into pd.DataFrame — confirm that is acceptable.
processed_dataset = Dataset.from_pandas(pd.DataFrame(processed_dataset))
# Bare expression: shows the Dataset summary as the cell output.
processed_dataset

In [None]:
# Display the chat template string currently attached to the tokenizer.
tokenizer.chat_template

```
{'id': 'generation-for-nlp-425',
 'messages': [{'role': 'system', 'content': '지문을 읽고 질문의 답을 구하세요.'},
  {'role': 'user',
   'content': '지문:\n상소하여 아뢰기를 , “신이 좌참 찬 송준길이 올린 차자를 보았는데 , 상복(喪服) 절차에 대하여 논한 것이 신과는 큰 차이가 있었습니다 . 장자를 위하여 3년을 입는 까닭은 위로 ‘정체(正體)’가 되기 때문이고 또 전 중(傳重: 조상의 제사나 가문의 법통을 전함)하기 때문입니다 . …(중략) … 무엇보다 중요한 것은 할아버지와 아버지의 뒤를 이은 ‘정체’이지, 꼭 첫째이기 때문에 참 최 3년 복을 입는 것은 아닙니다 .”라고 하였다 .－현종실록 －ㄱ.기 사환국으로 정권을 장악하였다 .ㄴ.인 조반정을 주도 하여 집권세력이 되었다 .ㄷ.정조 시기에 탕평 정치의 한 축을 이루었다 .ㄹ.이 이와 성혼의 문인을 중심으로 형성되었다.\n\n질문:\n상소한 인물이 속한 붕당에 대한 설명으로 옳은 것만을 모두 고르면?\n\n선택지:\n1 - ㄱ, ㄴ\n2 - ㄱ, ㄷ\n3 - ㄴ, ㄹ\n4 - ㄷ, ㄹ\n\n1, 2, 3, 4, 5 중에 하나를 정답으로 고르세요.\n정답:'},
  {'role': 'assistant', 'content': '2'}],
 'label': 2}
```
위 예시와 같은 형태의 레코드를 Llama-3 Chat Template 형식에 맞게 변환한다.

In [None]:
from pprint import pprint
def formatting_prompts_func(example):
    """Render every conversation in a batch to a single chat-template string.

    Args:
        example: A batch (or Dataset) whose "messages" column holds one list of
            role/content dicts per conversation.

    Returns:
        A list of strings, one per conversation, produced by
        `tokenizer.apply_chat_template(..., tokenize=False)`.
    """
    return [
        tokenizer.apply_chat_template(conversation, tokenize=False)
        for conversation in example["messages"]
    ]

# Preview the rendered text of the first record.
pprint(formatting_prompts_func(processed_dataset.select(range(1))))

베이스라인 코드대로라면, Llama 3 챗 템플릿에서 assistant 헤더인
`<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n` 이후의 출력값만을 사용해야 함.

베이스라인 코드에서는 `[-2]` 인덱스로 뒤에서 두 번째 위치, 즉 답변 부분만 추출했음.

In [None]:
def tokenize(element):
    """Render a batch of conversations with the chat template and tokenize the result.

    Args:
        element: A batch passed by `Dataset.map(batched=True)`; it is forwarded to
            `formatting_prompts_func`, which reads its "messages" column.

    Returns:
        A dict with "input_ids" and "attention_mask", one entry per conversation.
        No truncation or padding is applied.
    """
    texts = formatting_prompts_func(element)

    # The rendered chat text may already begin with the BOS token emitted by the chat template.
    # Tokenizing it with the default add_special_tokens=True would then prepend a second BOS,
    # so special-token insertion is disabled whenever every rendered text already starts with
    # BOS. If the template does not emit BOS, the previous behaviour is kept unchanged.
    bos_token = tokenizer.bos_token
    template_adds_bos = bool(bos_token) and all(text.startswith(bos_token) for text in texts)

    outputs = tokenizer(
        texts,
        add_special_tokens=not template_adds_bos,
        truncation=False,
        padding=False,
        return_overflowing_tokens=False,
        return_length=False,
    )
    return {
        "input_ids": outputs["input_ids"],
        "attention_mask": outputs["attention_mask"],
    }

# Tokenize the data; the original columns are dropped so only the token fields remain.
tokenized_dataset = processed_dataset.map(
    tokenize,
    remove_columns=list(processed_dataset.features),
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Tokenizing",
)

In [None]:
# Hold out 25% of the tokenized examples for evaluation; the seed keeps the split repeatable.
tokenized_dataset = tokenized_dataset.train_test_split(seed=42, test_size=0.25)
train_dataset, eval_dataset = tokenized_dataset['train'], tokenized_dataset['test']

# Inspect the data: decode the first training example, keeping special tokens visible.
print(tokenizer.decode(train_dataset[0]["input_ids"], skip_special_tokens=False))

In [None]:
import torch
import numpy as np
import evaluate

# ---------------------------------------------------
# 1. Metrics and label mapping
# ---------------------------------------------------
# Metric objects used by compute_metrics.
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

# Answer strings the model may emit, mapped to class indices 0..4.
candidate_labels = ["1", "2", "3", "4", "5"]
int_output_map = dict(zip(candidate_labels, range(len(candidate_labels))))

# ---------------------------------------------------
# 2. Pre-processing (logits -> argmax)
# ---------------------------------------------------
def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to predicted ids by taking the argmax over the last dimension.

    Args:
        logits: A tensor, or a tuple whose first element is the logits tensor.
        labels: Unused; present to match the expected callback signature.

    Returns:
        Tensor of argmax indices with the last dimension removed.
    """
    scores = logits[0] if isinstance(logits, tuple) else logits
    return scores.argmax(dim=-1)

# ---------------------------------------------------
# 3. Metric computation
# ---------------------------------------------------
def compute_metrics(eval_res):
    """Score the single answer token of every evaluation example.

    Args:
        eval_res: Pair `(predictions, labels)`. `predictions` holds the per-position
            argmax ids from `preprocess_logits_for_metrics`; `labels` uses -100 at
            every position that is excluded from scoring.

    Returns:
        dict with "accuracy" and macro-averaged "f1" over the mapped class indices
        (an answer that is not one of the candidate labels is mapped to -1).
    """
    predictions, labels = eval_res

    # Move tensors to NumPy so both inputs can be indexed the same way.
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()

    final_preds = []
    final_refs = []

    for row, label_row in enumerate(labels):
        # Positions that are not masked out with -100.
        answer_positions = np.where(label_row != -100)[0]
        if len(answer_positions) == 0:
            continue

        # The answer is the last unmasked token, or the one before it when the
        # last unmasked token is EOS.
        target_pos = answer_positions[-1]
        ends_with_eos = label_row[target_pos] == tokenizer.eos_token_id
        if ends_with_eos and len(answer_positions) > 1:
            target_pos = answer_positions[-2]

        # [Key point] the prediction for a token is read one position earlier.
        pred_pos = max(0, target_pos - 1)

        # Decode both token ids back to text.
        decoded_label = tokenizer.decode([label_row[target_pos]], skip_special_tokens=True).strip()
        decoded_pred = tokenizer.decode([predictions[row][pred_pos]], skip_special_tokens=True).strip()
        if row % 10 == 0:
            print(f"Decoded Label: {decoded_label}, Decoded Pred: {decoded_pred}")

        # Map text to class index; anything outside the candidate labels becomes -1.
        final_refs.append(int_output_map.get(decoded_label, -1))
        final_preds.append(int_output_map.get(decoded_pred, -1))

    accuracy = acc_metric.compute(predictions=final_preds, references=final_refs)["accuracy"]
    macro_f1 = f1_metric.compute(predictions=final_preds, references=final_refs, average="macro")["f1"]
    return {"accuracy": accuracy, "f1": macro_f1}

In [None]:
# Padding-token setup.
# Fall back to EOS only when the tokenizer has no pad token of its own. Overwriting an
# existing, distinct pad token with EOS makes the two ids indistinguishable, so any collator
# that masks pad ids out of the labels (-100) would also stop training on the end-of-turn token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Bare expression: shows the resulting special-token mapping as the cell output.
tokenizer.special_tokens_map

In [None]:
from trl import SFTTrainer, SFTConfig
from transformers import EarlyStoppingCallback

# Build the supervised fine-tuning trainer on the pre-tokenized train/eval splits.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Can set up evaluation!
    # Evaluation hooks: reduce logits to ids first, then score the answer token.
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
    args=SFTConfig(
        dataset_text_field="text",
        # --- batching: 4 examples x 8 accumulation steps per optimizer update ---
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8,  # Use GA to mimic batch size!
        # --- optimization schedule ---
        num_train_epochs=3,  # Set this for 1 full training run.
        learning_rate=2e-5,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        weight_decay=0.01,
        optim="adamw_8bit",
        seed=42,
        # --- evaluation / checkpointing: evaluate every 10 steps, keep the best eval_f1 ---
        eval_strategy="steps",
        eval_steps=10,
        metric_for_best_model="eval_f1",
        save_strategy="best",
        save_steps=10,
        save_total_limit=1,
        load_best_model_at_end=True,
        # --- logging ---
        logging_steps=1,
        report_to="none",  # Use TrackIO/WandB etc
        # completion_only_loss=True,
    ),
    # Stop when the tracked metric has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Rebind `trainer` to the object returned by train_on_responses_only.
# The two marker strings are the Llama-3 chat headers that open a user turn and an assistant
# turn respectively.
# NOTE(review): presumably this restricts the loss to assistant responses — confirm against
# the Unsloth documentation.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

In [None]:
# @title Show current memory stats
# Report GPU 0's name and total memory, plus the value returned by
# torch.cuda.max_memory_reserved(), both converted from bytes by dividing by 1024**3.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
from unsloth import unsloth_train
# Run training through Unsloth's wrapper and keep whatever it returns in `trainer_stats`.
trainer_stats = unsloth_train(trainer)