In [1]:
!pip install datasets
!pip install accelerate
!pip install -U bitsandbytes
!pip install transformers
!pip install peft
!pip install sklearn
!pip install trl
!python -m pip install --upgrade pip

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manyli

In [2]:
import warnings
warnings.filterwarnings("ignore")
import os
import json
import numpy as np
import pandas as pd
import re
import string
from collections import Counter
from tqdm import tqdm
from peft import LoraConfig, get_peft_model
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
from accelerate import Accelerator
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
def remove_newlines(text):
    return text.replace('\n', '')

def remove_double(text):
    return text.replace('  ', '')
    
def prepare_data(df):
    return [f"너는 주어진 문맥을 토대로 질문에 대해 간결하게 답변하는 챗봇이야.\
                        주어진 4개의 단계를 따라 질문에 대한 답변을 해야해.\
                        단계 1. 문맥에서 질문에 대한 내용이 있는 부분을 찾아줘.\
                        단계 2. 단계1에서 찾은 부분에서 질문에 대한 답변을 찾아줘.\
                        단계 3. 문맥에 있는 답변을 바꾸거나 변형하지 말고 끝부분만 다듬어줘.\
                        단계 4. 답변이 10글자 이내라면 간단하게 키워드로 알려줘.\
                        문맥: {row['context']}\
                        질문: {row['question']}\
                        답변: {row['answer']}" for _, row in df.iterrows()]

def preprocess_function(examples, tokenizer):
    inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=1024)
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs

def normalize_answer(s):
    def remove_(text):
        text = re.sub("'", " ", text)
        text = re.sub('"', " ", text)
        text = re.sub('《', " ", text)
        text = re.sub('》', " ", text)
        text = re.sub('<', " ", text)
        text = re.sub('>', " ", text)
        text = re.sub('〈', " ", text)
        text = re.sub('〉', " ", text)
        text = re.sub("\(", " ", text)
        text = re.sub("\)", " ", text)
        text = re.sub("‘", " ", text)
        text = re.sub("’", " ", text)
        return text

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_punc(lower(remove_(s))))

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()

    prediction_Char = []
    for tok in prediction_tokens:
        now = [a for a in tok]
        prediction_Char.extend(now)

    ground_truth_Char = []
    for tok in ground_truth_tokens:
        now = [a for a in tok]
        ground_truth_Char.extend(now)

    common = Counter(prediction_Char) & Counter(ground_truth_Char)
    num_same = sum(common.values())
    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(prediction_Char)
    recall = 1.0 * num_same / len(ground_truth_Char)
    f1 = (2 * precision * recall) / (precision + recall)

    return f1

def evaluate(ground_truth_df, predictions_df):
    predictions = dict(zip(predictions_df['question'], predictions_df['answer']))
    f1 = exact_match = total = 0

    for index, row in ground_truth_df.iterrows():
        question_text = row['question']
        ground_truths = row['answer']
        total += 1
        if question_text not in predictions:
            continue
        prediction = predictions[question_text]
        f1 = f1 + f1_score(prediction, ground_truths)

    f1 = 100.0 * f1 / total
    return {'f1': f1}

In [12]:
def main():
    # 데이터 로딩 및 전처리
    all_data = pd.read_csv('./train.csv')
    all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)

    all_data['context'] = all_data['context'].apply(remove_newlines)
    all_data['context'] = all_data['context'].apply(remove_double)

    # 데이터를 훈련 및 검증 세트로 분할
    val_data = all_data[:100]
    train_data = all_data[100:]

    train_texts = prepare_data(train_data)
    val_texts = prepare_data(val_data)
    
    model_id = "davidkim205/ko-gemma-2-9b-it"

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        quantization_config=bnb_config,
        trust_remote_code=True
    )

    config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj', 'lm_head'],
        lora_dropout=0.05,
        task_type="CAUSAL_LM"
    )

    model = get_peft_model(model, config)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    model.gradient_checkpointing_enable()
    
    # 데이터셋 준비
    train_dataset = Dataset.from_dict({"text": train_texts})
    val_dataset = Dataset.from_dict({"text": val_texts})

    # 데이터셋 전처리
    train_dataset = train_dataset.map(lambda examples: preprocess_function(examples, tokenizer), batched=True, remove_columns=train_dataset.column_names)
    val_dataset = val_dataset.map(lambda examples: preprocess_function(examples, tokenizer), batched=True, remove_columns=val_dataset.column_names)

    # 데이터 콜레이터
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # 학습 인자 설정
    training_args = TrainingArguments(
        output_dir="./davidkim205_results",
        num_train_epochs=10,
        max_grad_norm=0.3,
        learning_rate=2e-4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=400,
        fp16=True,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=500,
        save_steps=500,
        gradient_accumulation_steps=8,
        dataloader_num_workers=4,  # num_workers 설정
        optim="adamw_hf",
        load_best_model_at_end=True,
    )

    # Trainer 초기화
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        peft_config=config,
        data_collator=data_collator,
    )

    # 학습 시작
    trainer.train()

    # 파인튜닝된 모델 저장
    model.save_pretrained("./davidkim205_fine_tuned_model")

In [None]:
main()

Map:   0%|          | 0/33616 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable

Step,Training Loss,Validation Loss
