In [1]:

from datasets import load_dataset

# Set the dataset to load here (rows expose 'num', 'korean' and 'english' fields).
data = load_dataset("squarelike/sharegpt_deepl_ko_translation")

In [2]:
# Display the DatasetDict summary: available splits, feature names and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['num', 'korean', 'english'],
        num_rows: 200524
    })
})

In [3]:
def makedata(x):
    """Format one dataset row as a two-line translation prompt.

    The row's ``num`` value picks the direction: an even ``num`` puts the
    Korean sentence first and the English sentence second, an odd ``num``
    reverses that order. Each sentence is prefixed with its language header
    and closed with the ``</끝>`` end marker.

    Args:
        x: Mapping with ``'num'`` (int), ``'korean'`` and ``'english'`` entries.

    Returns:
        A dict with a single ``'text'`` key holding the formatted prompt.
    """
    korean_first = x['num'] % 2 == 0
    korean_line = f"### 한국어: {x['korean']}</끝>"
    english_line = f"### 영어: {x['english']}</끝>"
    # Same two segments either way; only their order depends on the parity.
    ordered = (korean_line, english_line) if korean_first else (english_line, korean_line)
    return {'text': "\n".join(ordered)}

# Run makedata over every row so each example gains the formatted 'text' field.
data = data.map(makedata)

Map:   0%|          | 0/200524 [00:00<?, ? examples/s]

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

model_id = "EleutherAI/polyglot-ko-1.3b"

# The tokenizer is loaded from the same checkpoint id as the base model.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Quantization settings: 4-bit loading, NF4 quant type, double quantization,
# with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# device_map {"": 0} maps the root module to device index 0
# (presumably the first GPU — confirm for the target machine).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Tokenize the formatted 'text' field, handing the tokenizer whole batches at a time.
data = data.map(
    lambda batch: tokenizer(batch["text"]),
    batched=True,
)

Map:   0%|          | 0/200524 [00:00<?, ? examples/s]

In [6]:
# Spot-check the formatted prompt of one training row
# (index 100000 looks like an arbitrary pick — any valid row index would do).
data['train'][100000]['text']

'### 한국어: 파일 연결을 포함하여 등록된 응용 프로그램에 대한 정보를 저장하는 레지스트리 루트 키</끝>\n### 영어: which registry root key stores information about registered applications including file associations</끝>'

In [9]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing on the quantized base model, then hand it to
# PEFT's k-bit preparation helper and keep the returned model.
# NOTE(review): recent peft releases appear to enable gradient checkpointing
# inside prepare_model_for_kbit_training by default, so the explicit call may
# be redundant — confirm against the installed version before removing it.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [10]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: Any object whose ``named_parameters()`` yields
            ``(name, parameter)`` pairs, where every parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Prints a single line with the trainable element count, the total element
    count and the trainable percentage. A model without any parameters is
    reported as 0.0 percent instead of raising ``ZeroDivisionError``.

    NOTE(review): counts are taken directly from ``numel()``; for 4-bit
    quantized weights this may reflect packed storage rather than the logical
    parameter count — confirm before quoting the totals.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio so an empty model does not divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [11]:
from peft import LoraConfig, get_peft_model

# LoRA settings for causal-LM fine-tuning: rank 8, alpha 32, dropout 0.05,
# no bias training, adapters attached to modules named "query_key_value".
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared model with the adapters, then report the trainable share.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 1572864 || all params: 729403392 || trainable%: 0.21563705588032142
