In [1]:
!pip install -r "/requirements.txt"



In [2]:
!pip install bitsandbytes



In [3]:
import torch
import transformers
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import pandas as pd
import numpy as np
import random
from sklearn.metrics import accuracy_score
from peft import LoraConfig, get_peft_model
import evaluate
from tqdm import tqdm
from ast import literal_eval
from transformers import BitsAndBytesConfig

pd.set_option('display.max_columns', None)

In [4]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write

In [5]:
def set_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)  # if multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)

set_seed(42)

In [6]:
# 학습 데이터 로드
train_data = pd.read_csv('/balanced_dataset.csv')  # balanced.csv 사용

# 데이터 처리
records = []
for _, row in train_data.iterrows():
    record = {
        'id': row['id'],
        'paragraph': row['paragraph'],
        'question': row['question'],
        'choices': literal_eval(row['choices']),  # 문자열 -> 리스트 변환
        'answer': str(row['answer']),  # answer를 문자열로 변환
        'question_plus': row['question_plus'],   # 필요하면 사용
    }
    records.append(record)

# DataFrame 생성
df = pd.DataFrame(records)

# Hugging Face Dataset으로 변환
dataset = Dataset.from_pandas(df)

In [7]:
model_name = "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"

# # 4-bit 양자화 설정
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,  # 필요에 따라 설정
#     bnb_4bit_quant_type='nf4',       # 'nf4' 또는 'fp4' 선택 가능
#     bnb_4bit_compute_dtype=torch.float16  # 연산 시 사용할 데이터 타입
# )

# # 8-bit 양자화 설정
# bnb_config = BitsAndBytesConfig(
#     load_in_8bit=True,  # 8-bit 양자화로 변경
#     bnb_8bit_compute_dtype=torch.float16,  # 8-bit 양자화에서 연산 시 데이터 타입
# )


# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,  # 양자화 설정 적용
#     device_map="auto",
#     trust_remote_code=True
# )

#16bit
# 모델 로드
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,  # 16-bit 사용
    trust_remote_code=True
)


tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Configure PEFT (LoRA)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=['q_proj', 'v_proj'],
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [8]:
PROMPT_NO_QUESTION_PLUS = """지문:
{paragraph}

질문:
{question}

선택지:
{choices}

1, 2, 3, 4, 5 중에 하나를 정답으로 고르세요. 단 숫자에는 아무런 의미가 없다
정답:"""

PROMPT_QUESTION_PLUS = """지문:
{paragraph}

질문:
{question}

<보기>:
{question_plus}

선택지:
{choices}

1, 2, 3, 4, 5 중에 하나를 정답으로 고르세요. 단 숫자에는 아무런 의미가 없다
정답:"""

In [9]:
processed_dataset = []

# Hugging Face Dataset을 리스트로 변환하여 처리
for i in range(len(dataset)):  # 또는 dataset.to_dict("records")를 사용 가능
    choices_string = "\n".join([f"{idx + 1} - {choice}" for idx, choice in enumerate(dataset[i]["choices"])])

    # <보기>가 있을 때
    if dataset[i]["question_plus"]:
        user_message = PROMPT_QUESTION_PLUS.format(
            paragraph=dataset[i]["paragraph"],
            question=dataset[i]["question"],
            question_plus=dataset[i]["question_plus"],
            choices=choices_string,
        )
    # <보기>가 없을 때
    else:
        user_message = PROMPT_NO_QUESTION_PLUS.format(
            paragraph=dataset[i]["paragraph"],
            question=dataset[i]["question"],
            choices=choices_string,
        )

    # 메시지 추가
    processed_dataset.append(
        {
            "id": dataset[i]["id"],
            "messages": [
                {"role": "system", "content": "지문을 읽고 질문의 답을 구하세요."},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": f"{dataset[i]['answer']}"}
            ],
            "label": dataset[i]["answer"],
        }
    )

# 리스트를 Pandas DataFrame으로 변환 후 Hugging Face Dataset으로 변환
processed_dataset = Dataset.from_pandas(pd.DataFrame(processed_dataset))


In [10]:
# 메시지 기반 토큰화 함수
def tokenize_function(example):
    chat_input = [
        f"<{message['role']}> {message['content']}" for message in example["messages"]
    ]
    full_input = "\n".join(chat_input)

    tokenized_full = tokenizer(full_input, truncation=True, max_length=2048, add_special_tokens=True)

    # 정답 토큰 위치 계산
    answer_token = tokenizer.convert_tokens_to_ids(str(example["label"]))
    labels = [-100] * len(tokenized_full["input_ids"])
    labels[-2] = answer_token  # 마지막에서 두 번째 위치에 정답 설정

    return {
        "input_ids": tokenized_full["input_ids"],
        "attention_mask": tokenized_full["attention_mask"],
        "labels": labels,
    }


tokenized_dataset = processed_dataset.map(
    tokenize_function,
    batched=False,
    remove_columns=["messages"],
    num_proc=4,
    desc="Tokenizing dataset"
)

Tokenizing dataset (num_proc=4):   0%|          | 0/1982 [00:00<?, ? examples/s]

In [11]:
# 최대 길이 초과 필터링
tokenized_dataset = tokenized_dataset.filter(lambda x: len(x["input_ids"]) <= 2048)

# Train-Test Split
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

# Train-Test Split 후 각각 셔플링 (seed 고정)
train_dataset = tokenized_dataset["train"].shuffle(seed=42)
eval_dataset = tokenized_dataset["test"]

Filter:   0%|          | 0/1982 [00:00<?, ? examples/s]

In [12]:
def data_collator(features):
    max_length = max(len(f["input_ids"]) for f in features)
    input_ids = [f["input_ids"] + [tokenizer.pad_token_id] * (max_length - len(f["input_ids"])) for f in features]
    attention_mask = [f["attention_mask"] + [0] * (max_length - len(f["attention_mask"])) for f in features]
    labels = [f["labels"] + [-100] * (max_length - len(f["labels"])) for f in features]

    return {
        "input_ids": torch.tensor(input_ids, dtype=torch.long),
        "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
        "labels": torch.tensor(labels, dtype=torch.long),
    }



In [13]:
# 로짓 전처리 함수 정의
def preprocess_logits_for_metrics(logits, labels):
    logits = logits[0] if isinstance(logits, tuple) else logits
    # '1'부터 '5'까지의 토큰 ID를 가져옵니다.
    logit_idx = [
        tokenizer.convert_tokens_to_ids("1"),
        tokenizer.convert_tokens_to_ids("2"),
        tokenizer.convert_tokens_to_ids("3"),
        tokenizer.convert_tokens_to_ids("4"),
        tokenizer.convert_tokens_to_ids("5")
    ]
    # 시퀀스의 마지막에서 두 번째 토큰에서만 로짓을 선택합니다.
    logits = logits[:, -2, logit_idx]  # -2: answer token, -1: eos token
    return logits

In [14]:
# 정확도 메트릭 로드
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

# 정답 토큰 매핑 정의
answer_map = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4}

# def compute_metrics(eval_preds):
#     logits, labels = eval_preds

#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     labels = [answer_map[label.strip()] for label in labels]

#     probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1)
#     predictions = torch.argmax(probs, dim=-1).numpy()

#     acc = acc_metric.compute(predictions=predictions, references=labels)

#     f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")

#     return {"accuracy": acc["accuracy"], "f1": f1["f1"]}

# EXAONE 용
def compute_metrics(eval_preds):
    logits, labels = eval_preds

    # -100으로 마스킹된 레이블을 패드 토큰 ID로 교체
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    labels = [answer_map[label.strip()] for label in labels]

    # 소프트맥스 계산을 위해 float32로 변환
    probs = torch.nn.functional.softmax(torch.tensor(logits, dtype=torch.float32), dim=-1)
    predictions = torch.argmax(probs, dim=-1).numpy()

    # 정확도 및 F1 점수 계산
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")

    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}



In [18]:
from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir="outputs_mistral",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=2,
    max_steps=2100,
    learning_rate=1e-5,  # 학습률 감소
    weight_decay=0.05,
    warmup_steps=200,  # Warmup Steps 추가
    logging_steps=10,
    save_strategy="steps",
    save_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
    push_to_hub=True,
    hub_model_id="Toastmachine/exaone_CSAT_test",
    lr_scheduler_type="cosine",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

max_steps is given, it will override any value given in num_train_epochs


In [19]:
# 학습 시작
trainer.train()

trainer.push_to_hub("Toastmachine/exaone_CSAT_test")
tokenizer.push_to_hub("Toastmachine/exaone_CSAT_test")

Step,Training Loss,Validation Loss,Accuracy,F1
50,24.6602,24.109375,0.547739,0.576066
100,12.8586,6.570312,0.572864,0.606195
150,0.3657,0.495605,0.567839,0.600508
200,0.5527,0.488037,0.567839,0.600508
250,0.9587,0.509766,0.572864,0.605428
300,0.9119,0.446777,0.567839,0.601553
350,0.0989,0.468994,0.572864,0.60658
400,0.6981,0.461182,0.562814,0.596508
450,0.5197,0.479248,0.562814,0.596508




README.md:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Toastmachine/exaone_CSAT_test/commit/d904d040c8b24fb3eb218586f78f324b247e8364', commit_message='Upload tokenizer', commit_description='', oid='d904d040c8b24fb3eb218586f78f324b247e8364', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Toastmachine/exaone_CSAT_test', endpoint='https://huggingface.co', repo_type='model', repo_id='Toastmachine/exaone_CSAT_test'), pr_revision=None, pr_num=None)