# LoRA config, quantization

In [None]:

from transformers import BitsAndBytesConfig
import torch

# bitsandbytes settings for loading a model with 4-bit weights.
# NOTE(review): quantization_config is not passed to any model loader in the
# cells visible here — confirm it is actually applied somewhere.
quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,  # request 4-bit weight loading
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
    bnb_4bit_use_double_quant=True,  # enable double quantization
    bnb_4bit_quant_type='nf4'  # 4-bit quantization type
)

from peft import LoraConfig
# LoRA adapter hyper-parameters used for fine-tuning.
# NOTE(review): task_type is SEQ_2_SEQ_LM, but the checkpoint name used below
# ('GPT_small_en') and the completion-only collator suggest a decoder-only
# model — confirm whether "CAUSAL_LM" is what is intended.
peft_config = LoraConfig(
    lora_alpha=16,  # LoRA scaling numerator; equal to r here
    lora_dropout=0,  # no dropout on the LoRA layers
    r=16,  # LoRA rank
    bias="none",  # bias setting passed through unchanged to PEFT
    task_type="SEQ_2_SEQ_LM"
)

# Load model

In [None]:
# Path of the checkpoint file. At this point `model` is a plain string.
model = 'result/GPT_small_en.pt'

from peft import PeftModel
# NOTE(review): `model` is still the path string here and the second argument
# is the LoraConfig object. PeftModel.from_pretrained presumably expects an
# instantiated base model plus the location of saved adapter weights — verify
# against the PEFT documentation. No visible cell loads the checkpoint into a
# model object, and quantization_config is not used on this path.
model = PeftModel.from_pretrained(model, peft_config, is_trainable=True)
# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()

from transformers import MarianTokenizer
# Tokenizer is loaded from the 'Helsinki-NLP/opus-mt-ko-en' checkpoint.
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-ko-en')

# Ids of the padding and end-of-sequence tokens, kept as notebook globals.
pad_idx, eos_idx = tokenizer.pad_token_id, tokenizer.eos_token_id

# Convert to Alpaca format

In [None]:

def convert_to_alpaca_format(instruction: str, response: str) -> str:
    """Render an instruction/response pair as one Alpaca-style prompt.

    The result is the fixed Alpaca preamble, a blank line, an
    ``### Instruction:`` section holding *instruction*, a blank line, and an
    ``### Response:`` section holding *response*.

    Args:
        instruction: Task description, interpolated verbatim.
        response: Answer text, interpolated verbatim.

    Returns:
        The formatted prompt. Nothing is appended after *response*.
    """
    # Adjacent literals replace the former triple-quoted string with backslash
    # continuations: a continuation keeps the next source line's indentation,
    # which used to insert four spaces after the preamble and four more after
    # the response.
    return (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{instruction}"
        f"\n\n### Response:\n{response}"
    )
def prompt_formatting_func(examples):
    """Build the ``text`` column for a batch of dataset rows.

    Args:
        examples: Batch mapping with parallel ``"instruction"`` and
            ``"output"`` sequences (the shape produced by a batched map).

    Returns:
        A dict with a single key ``"text"`` whose value is a list of
        Alpaca-formatted prompts, each followed by the tokenizer's EOS token
        string.
    """
    # The module-level eos_idx holds tokenizer.eos_token_id (an integer), so
    # concatenating it to the prompt raised TypeError. Append the EOS token's
    # text instead.
    eos_token = tokenizer.eos_token
    return {
        "text": [
            convert_to_alpaca_format(instruction, output) + eos_token
            for instruction, output in zip(
                examples["instruction"], examples["output"]
            )
        ],
    }

# dataset

In [None]:
# Dataset Load
from datasets import load_dataset

# Load the training split of the "yahma/alpaca-cleaned" dataset.
dataset = load_dataset("yahma/alpaca-cleaned", split="train")

# Shuffle with a fixed seed so the order is reproducible.
dataset = dataset.shuffle(seed=42)
# Keep only rows whose 'input' field is the empty string.
no_input_dataset = dataset.filter(lambda example: example['input'] == '')
# Add the formatted "text" column, processing rows in batches.
mapped_dataset = no_input_dataset.map(prompt_formatting_func, batched=True)
# Hold out 1% of the rows for evaluation, again with a fixed seed.
split_dataset = mapped_dataset.train_test_split(test_size=0.01, seed=42)

train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Data Collator

In [None]:

from trl import DataCollatorForCompletionOnlyLM
# Marker that precedes the answer in every formatted prompt. It is handed to
# the collator as its response template (presumably so that only the tokens
# after it count towards the loss — see the TRL documentation).
response_template = "### Response:\n"
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
    mlm=False,
)
# Kept as a kwargs dict because it is unpacked into the SFTTrainer call.
data_collator_param = {"data_collator": collator}

# Directory passed to TrainingArguments and to the TensorBoard magic.
# NOTE(review): this is an absolute path at the filesystem root — confirm that
# is intended rather than a relative "./fine_tune_output".
output_dir = "/fine_tune_output"

In [None]:
# TensorBoard setup: load the notebook extension and point it at the "runs"
# directory under output_dir (presumably where the trainer writes its
# TensorBoard logs — verify).
%load_ext tensorboard
%tensorboard --logdir '{output_dir}/runs'

# Training

In [None]:

from trl import SFTTrainer
from transformers import TrainingArguments

training_arguments = TrainingArguments(
    # Output and logging.
    output_dir=output_dir,
    report_to="tensorboard",
    logging_steps=1,
    # Batching: 2 sequences per device, accumulated over 8 steps.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    # Schedule: 100 steps in total, the first 50 of them warmup, then a
    # constant learning rate.
    max_steps=100,
    warmup_steps=50,
    lr_scheduler_type="constant_with_warmup",
    learning_rate=1e-4,
    # Evaluate every 10 steps and save every 50 steps.
    evaluation_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=50,
    # Optimizer.
    optim="adamw_8bit",
    weight_decay=0.01,
    # Reproducibility and memory.
    seed=42,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True},
)

# Assemble the supervised fine-tuning trainer from the objects built above.
# NOTE(review): `model` comes from PeftModel.from_pretrained and peft_config
# is passed here as well — confirm the adapter is not meant to be applied in
# only one of the two places.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    # Column produced by prompt_formatting_func.
    dataset_text_field="text",
    max_seq_length=2048,
    dataset_num_proc=2,
    packing=False,
    args=training_arguments,
    # Supplies data_collator=<completion-only collator>.
    **data_collator_param,
)

In [None]:
# Run training and keep whatever trainer.train() returns in train_stats.
train_stats = trainer.train()