# SOLAR 실행 코드

## Import Necessary Libraries

In [1]:
import torch
import transformers
from ast import literal_eval
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig, EarlyStoppingCallback, TrainingArguments
from datasets import Dataset
import json
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import evaluate
import bitsandbytes as bnb
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from peft import AutoPeftModelForCausalLM, LoraConfig, prepare_model_for_kbit_training, get_peft_model

pd.set_option('display.max_columns', None) 

In [None]:
!pip install --upgrade transformers bitsandbytes accelerate

In [2]:
# 난수 고정
def set_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)

set_seed(42) # magic number :)

### Load Data

In [3]:
# Load the train dataset
# TODO Train Data 경로 입력
train_dataset = pd.read_csv('../../data/train-splited.csv')
dev_dataset = pd.read_csv('../../data/dev-splited.csv')

# Flatten the JSON dataset
train_records = [] 
dev_records = [] 

for _, row in train_dataset.iterrows():
    problems = literal_eval(row['problems'])
    record = {
        'id': row['id'],
        'paragraph': row['paragraph'],
        'question': problems['question'],
        'choices': problems['choices'],
        'answer': problems.get('answer', None),
        "question_plus": problems.get('question_plus', None),
    }
    # Include 'question_plus' if it exists
    if 'question_plus' in problems:
        record['question_plus'] = problems['question_plus']
    train_records.append(record)
    
for _, row in dev_dataset.iterrows():
    problems = literal_eval(row['problems'])
    record = {
        'id': row['id'],
        'paragraph': row['paragraph'],
        'question': problems['question'],
        'choices': problems['choices'],
        'answer': problems.get('answer', None),
        "question_plus": problems.get('question_plus', None),
    }
    # Include 'question_plus' if it exists
    if 'question_plus' in problems:
        record['question_plus'] = problems['question_plus']
    dev_records.append(record)
        
# Convert to DataFrame
train_df = pd.DataFrame(train_records)
dev_df = pd.DataFrame(dev_records)

## Model Training

In [4]:
# 본인의 Huggingface auth token 입력
## Jupyter lab에서 로그인 하는 textbox가 나오지 않을 경우, terminal에서 로그인 하실 수 있습니다.
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `assignment_2_persona` has been saved to /data/ephemeral/home/.cache/huggingface/stored_tokens
Your token has been saved to /data/ephemeral/home/.cache/huggingface/token
Login successful.
The current active token is: `assignment_2_persona`


모델과 토크나이저를 불러옵니다.

In [11]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
)

model = AutoModelForCausalLM.from_pretrained(
    "davidkim205/komt-solar-10.7b-sft-v5", 
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    trust_remote_code=True, 
    device_map="auto",
    offload_folder="offload",
)

tokenizer = AutoTokenizer.from_pretrained(
    "davidkim205/komt-solar-10.7b-sft-v5",
    trust_remote_code=True
)

tokenizer.chat_template = """{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:' + message['content']+'

'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:
' + message['content']+'

'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:
'  + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:
' }}{% endif %}{% endfor %}"""

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

### Prepare LoRA

In [5]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=['q_proj', 'k_proj'],
    bias="none",
    task_type="CAUSAL_LM",
)

### Data Processing

In [6]:
train_dataset = Dataset.from_pandas(train_df) 
dev_dataset = Dataset.from_pandas(dev_df)

In [7]:
PROMPT_NO_QUESTION_PLUS = """지문:
{paragraph}

질문:
{question}

선택지:
{choices}

1, 2, 3, 4, 5 중에 하나를 정답으로 고르세요.
정답:"""

PROMPT_QUESTION_PLUS = """지문:
{paragraph}

질문:
{question}

<보기>:
{question_plus}

선택지:
{choices}

1, 2, 3, 4, 5 중에 하나를 정답으로 고르세요.
정답:"""

In [None]:
train_dataset 
dev_dataset

In [8]:
train_processed_dataset = []
dev_processed_dataset = []

for i in range(len(train_dataset)):
    choices_string = "\n".join([f"{idx + 1} - {choice}" for idx, choice in enumerate(train_dataset[i]["choices"])])

    # <보기>가 있을 때
    if train_dataset[i]["question_plus"]:
        user_message = PROMPT_QUESTION_PLUS.format(
            paragraph=train_dataset[i]["paragraph"],
            question=train_dataset[i]["question"],
            question_plus=train_dataset[i]["question_plus"],
            choices=choices_string,
        )
    # <보기>가 없을 때
    else:
        user_message = PROMPT_NO_QUESTION_PLUS.format(
            paragraph=train_dataset[i]["paragraph"],
            question=train_dataset[i]["question"],
            choices=choices_string,
        )

    # chat message 형식으로 변환
    train_processed_dataset.append(
        {
            "id": train_dataset[i]["id"],
            "messages": [
                {"role": "system", "content": "지문을 읽고 답을 말해주세요."},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": f"{train_dataset[i]['answer']}"}
            ],
            "label": train_dataset[i]["answer"],
        }
    )
    
for i in range(len(dev_dataset)):
    choices_string = "\n".join([f"{idx + 1} - {choice}" for idx, choice in enumerate(dev_dataset[i]["choices"])])

    # <보기>가 있을 때
    if dev_dataset[i]["question_plus"]:
        user_message = PROMPT_QUESTION_PLUS.format(
            paragraph=dev_dataset[i]["paragraph"],
            question=dev_dataset[i]["question"],
            question_plus=dev_dataset[i]["question_plus"],
            choices=choices_string,
        )
    # <보기>가 없을 때
    else:
        user_message = PROMPT_NO_QUESTION_PLUS.format(
            paragraph=dev_dataset[i]["paragraph"],
            question=dev_dataset[i]["question"],
            choices=choices_string,
        )

    # chat message 형식으로 변환
    dev_processed_dataset.append(
        {
            "id": dev_dataset[i]["id"],
            "messages": [
                {"role": "system", "content": "지문을 읽고 답을 말해주세요."},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": f"{dev_dataset[i]['answer']}"}
            ],
            "label": dev_dataset[i]["answer"],
        }
    )


In [9]:
train_processed_dataset = Dataset.from_pandas(pd.DataFrame(train_processed_dataset))
dev_processed_dataset = Dataset.from_pandas(pd.DataFrame(dev_processed_dataset))

train_processed_dataset 
dev_processed_dataset

Dataset({
    features: ['id', 'messages', 'label'],
    num_rows: 406
})

In [12]:

def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example["messages"])):
        output_texts.append(
            tokenizer.apply_chat_template(
                example["messages"][i],
                tokenize=False,
            )
        )
    return output_texts

def tokenize(element):
    outputs = tokenizer(
        formatting_prompts_func(element),
        truncation=True,
        padding=False,
        max_length=2048,
        return_overflowing_tokens=False,
        return_length=False,
    ) 
    
    return {
        "input_ids": outputs["input_ids"],
        "attention_mask": outputs["attention_mask"],
    }

# 데이터 토큰화
train_tokenized_dataset = train_processed_dataset.map(
    tokenize,
    remove_columns=list(train_processed_dataset.features),
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Tokenizing",
)

dev_tokenized_dataset = dev_processed_dataset.map(
    tokenize,
    remove_columns=list(dev_processed_dataset.features),
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Tokenizing",
)

Tokenizing (num_proc=4):   0%|          | 0/1623 [00:00<?, ? examples/s]

Tokenizing (num_proc=4):   0%|          | 0/406 [00:00<?, ? examples/s]

In [13]:
# 데이터 분리
train_tokenized_dataset = train_tokenized_dataset.filter(lambda x: len(x["input_ids"]) <= 1024)
dev_tokenized_dataset = dev_tokenized_dataset.filter(lambda x: len(x["input_ids"]) <= 1024)

train_dataset = train_tokenized_dataset
eval_dataset = dev_tokenized_dataset

Filter:   0%|          | 0/1623 [00:00<?, ? examples/s]

Filter:   0%|          | 0/406 [00:00<?, ? examples/s]

Completion 부분만 학습하기 위한 data collator 설정

- 텍스트 중 response_template 까지는 ignore_index 로 loss 계산에서 제외
- 텍스트 중 response_template 이후는 학습에 포함 (정답 + eos 토큰)

In [14]:
def find_subsequence(sequence, subsequence):
    """시퀀스에서 하위 시퀀스의 시작 인덱스를 찾는 헬퍼 함수."""
    for idx in range(len(sequence) - len(subsequence) + 1):
        if sequence[idx:idx + len(subsequence)] == subsequence:
            return idx
    return -1  # 찾을 수 없을 때

def custom_data_collator(features, tokenizer, response_template):
    for feature in features:
        input_ids = feature['input_ids']
        labels = [-100] * len(input_ids)  # 모든 라벨을 기본적으로 무시(-100)로 초기화
        
        # response_template을 토큰화
        response_template_tokens = tokenizer.encode(response_template, add_special_tokens=False) 
        
        # input_ids 내에서 response_template의 시작 인덱스 찾기
        template_start_idx = find_subsequence(input_ids, response_template_tokens)
        start_token_idx = template_start_idx + len(response_template_tokens)
        
        # response_template 이후 텍스트 추출
        extracted_output_tokens = input_ids[start_token_idx:]  # 응답 부분만 추출
        extracted_output = tokenizer.decode(extracted_output_tokens, skip_special_tokens=True)
        
        # 추출된 출력 내용 출력
        print("Extracted Output:", extracted_output)
        
        if template_start_idx != -1:
            # response_template 이후 응답의 시작 토큰 인덱스 계산
            start_token_idx = template_start_idx + len(response_template_tokens)
            
            # response_template 이후의 모든 토큰을 라벨로 설정
            for i in range(start_token_idx, len(input_ids)):
                labels[i] = input_ids[i]
        else:
            print("Response template이 input_ids 내에서 발견되지 않았습니다.")
        
        feature['labels'] = labels  # 라벨 설정
        
    return tokenizer.pad(features, return_tensors="pt")  # 배치 패딩 처리

### Metric 설정

In [15]:
# 모델의 logits 를 조정하여 정답 토큰 부분만 출력하도록 설정
def preprocess_logits_for_metrics(logits, labels): 
    logits = logits if not isinstance(logits, tuple) else logits[0]
    logit_idx = [tokenizer.vocab["1"], tokenizer.vocab["2"], tokenizer.vocab["3"], tokenizer.vocab["4"], tokenizer.vocab["5"]]
    logits = logits[:, -2, logit_idx] # -2: answer token, -1: eos token
    return logits

# metric 로드
acc_metric = evaluate.load("accuracy")

# 정답 토큰 매핑
int_output_map = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4}

# <end_of_turn> 대신 </s>로 대체하여 정답만 남기고 나머지 제거
def extract_answer_from_label(label):
    return label.split()[-1]

# metric 계산 함수
def compute_metrics(evaluation_result):
    logits, labels = evaluation_result 
    
    print("라벨 길이: ", len(labels))
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    labels = [label for label in labels if label.strip() != '']
    labels = [label.replace("### Assistant:", "").strip() for label in labels]
    labels = list(map(lambda x: x.split("</s>")[0].strip(), labels)) 
    labels = list(map(extract_answer_from_label, labels)) 
    labels = list(map(lambda x: int_output_map[x], labels))
    
    print("디코딩 후 레이블: ", labels)

    # 소프트맥스 함수를 사용하여 로그로 변환
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1)
    predictions = np.argmax(probs, axis=-1)

    # 정확도 계산
    acc = acc_metric.compute(predictions=predictions, references=labels)
    return acc

### Train

In [16]:
# pad token 설정
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '<|im_end|>',
 'unk_token': '<unk>',
 'pad_token': '<|im_end|>'}

In [17]:
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:2"

for name, param in model.named_parameters():
    if "transformer.h.0" in name:  # 예: 초기 레이어를 동결
        param.requires_grad = False

In [18]:
import torch

# # GPU가 사용 가능한지 확인
if torch.cuda.is_available():
    device = torch.device("cuda")
    model.to(device)
else:
    device = torch.device("cpu")
    print("CUDA를 사용할 수 없습니다. CPU로 모델을 학습합니다.")

You shouldn't move a model that is dispatched using accelerate hooks.


In [19]:
from functools import partial

tokenizer.padding_side = 'right'

# custom_data_collator를 tokenizer와 response_template을 고정한 상태로 생성
fixed_data_collator = partial(custom_data_collator, tokenizer=tokenizer, response_template="Assistant:\n")

early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

sft_config = SFTConfig(
    do_train=True,
    do_eval=True,
    fp16=True,
    lr_scheduler_type="cosine",
    max_seq_length=2048,
    output_dir="outputs_gemma",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.05,
    logging_steps=50,
    save_strategy="epoch",
    eval_strategy="epoch",
    save_total_limit=2,
    save_only_model=True,
    report_to="none",
    load_best_model_at_end=True
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=fixed_data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    peft_config=peft_config,
    args=sft_config,
    callbacks=[early_stopping]
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
%%time
from accelerate import Accelerator

accelerator = Accelerator()
accelerator.prepare(trainer)

trainer.train()

## Inference

In [20]:
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from peft import PeftModel

# TODO 학습된 Checkpoint 경로 입력
checkpoint_path = "../../data/outputs_solar_david/checkpoint-906"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
)

model = AutoModelForCausalLM.from_pretrained(
    "davidkim205/komt-solar-10.7b-sft-v5",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map="auto",
)

model = PeftModel.from_pretrained(
    model,
    checkpoint_path,
    torch_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(
    "davidkim205/komt-solar-10.7b-sft-v5",
    trust_remote_code=True,
)

tokenizer.chat_template = """{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:' + message['content']+'

'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:
' + message['content']+'

'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:
'  + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:
' }}{% endif %}{% endfor %}"""


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [21]:
# Load the test dataset
# TODO Test Data 경로 입력
test_df = pd.read_csv('../../data/test.csv')

# Flatten the JSON dataset
records = []
for _, row in test_df.iterrows():
    problems = literal_eval(row['problems'])
    record = {
        'id': row['id'],
        'paragraph': row['paragraph'],
        'question': problems['question'],
        'choices': problems['choices'],
        'answer': problems.get('answer', None),
        "question_plus": problems.get('question_plus', None),
    }
    # Include 'question_plus' if it exists
    if 'question_plus' in problems:
        record['question_plus'] = problems['question_plus']
    records.append(record)
        
# Convert to DataFrame
test_df = pd.DataFrame(records)

In [22]:
test_dataset = []
for i, row in test_df.iterrows():
    choices_string = "\n".join([f"{idx + 1} - {choice}" for idx, choice in enumerate(row["choices"])])
    len_choices = len(row["choices"])
    
    # <보기>가 있을 때
    if row["question_plus"]:
        user_message = PROMPT_QUESTION_PLUS.format(
            paragraph=row["paragraph"],
            question=row["question"],
            question_plus=row["question_plus"],
            choices=choices_string,
        )
    # <보기>가 없을 때
    else:
        user_message = PROMPT_NO_QUESTION_PLUS.format(
            paragraph=row["paragraph"],
            question=row["question"],
            choices=choices_string,
        )

    test_dataset.append(
        {
            "id": row["id"],
            "messages": [
                {"role": "system", "content": "지문을 읽고 질문의 답을 구하세요."},
                {"role": "user", "content": user_message},
            ],
            "label": row["answer"],
            "len_choices": len_choices,
        }
    )

In [23]:
import os
import torch

# # GPU가 사용 가능한지 확인
if torch.cuda.is_available():
    device = torch.device("cuda")
    model.to(device)
else:
    device = torch.device("cpu")
    print("CUDA를 사용할 수 없습니다. CPU로 모델을 학습합니다.")
    
for name, param in model.named_parameters():
    if "transformer.h.0" in name:  # 예: 초기 레이어를 동결
        param.requires_grad = False
    
    
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:2" 
torch.cuda.empty_cache()

In [24]:
%%time

infer_results = []

pred_choices_map = {0: "1", 1: "2", 2: "3", 3: "4", 4: "5"}

model.eval()
with torch.inference_mode():
    for data in tqdm(test_dataset):
        _id = data["id"]
        messages = data["messages"]
        len_choices = data["len_choices"]

        outputs = model(
            tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
            ).to("cuda")
        )

        logits = outputs.logits[:, -1].flatten().cpu()

        target_logit_list = [logits[tokenizer.vocab[str(i + 1)]] for i in range(len_choices)]

        probs = (
            torch.nn.functional.softmax(
                torch.tensor(target_logit_list, dtype=torch.float32)
            )
            .detach()
            .cpu()
            .numpy()
        )

        predict_value = pred_choices_map[np.argmax(probs, axis=-1)]
        row = {"id": _id, "answer": predict_value}
                
        target_logit_list = [logit.item() for logit in target_logit_list]
        if len_choices < len(pred_choices_map):
            target_logit_list += [None] * (len(pred_choices_map) - len_choices)
        for i, logit in enumerate(target_logit_list):
            row[f"logit_{pred_choices_map[i]}"] = logit

        infer_results.append(row)

100%|██████████| 869/869 [37:47<00:00,  2.61s/it]  

CPU times: user 24min 42s, sys: 13min 1s, total: 37min 44s
Wall time: 37min 47s





In [None]:
pd.DataFrame(infer_results).to_csv("output_solar_david.csv", index=False)

: 

In [None]:
pd.DataFrame(infer_results)