In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments

import torch
import wandb


# Start the Weights & Biases run that the rest of the notebook logs into.
wandb.init(project='Hanghae99_8basic', name="instruction-tuning")


# Split the dataset 80%:20% (test_size=0.2) into train and validation sets.
# The seed is fixed so that the validation set -- and therefore the reported
# eval loss -- is the same after every Restart & Run All.
dataset = load_dataset("lucasmccabe-lmi/CodeAlpaca-20k", split="train")
split_data = dataset.train_test_split(test_size=0.2, seed=42)

train_dataset = split_data['train']
eval_dataset = split_data['test']


# Load the tokenizer and the model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")


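# 'formatting_prompts_func' takes a batch of dataset examples and converts each 'instruction'/'output' pair into one prompt string.
# Each 'instruction' and 'output' pair is concatenated so the model can process it as a single text.
# Resulting format: '### Question: [instruction]\n ### Answer: [output]' (note the space between the newline and '### Answer:')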
def formatting_prompts_func(example):
    """Render every instruction/output pair of a batch as one prompt string.

    Args:
        example: Mapping whose 'instruction' and 'output' entries are
            position-indexable sequences. 'output' is read at every index
            of 'instruction'.

    Returns:
        A list with one string per instruction: '### Question: ' followed by
        the instruction, a newline, then ' ### Answer: ' (with its leading
        space) followed by the output.
    """
    return [
        f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
        for i in range(len(example['instruction']))
    ]

# Marker that precedes the answer part of each prompt. This is the same
# ' ### Answer:' text (leading space included) that formatting_prompts_func
# inserts before every output.
response_template = " ### Answer:"

# Completion-only collator built from the marker above and the tokenizer.
# NOTE(review): presumably it uses the marker to find where the response starts
# so that only the answer tokens contribute to the loss -- confirm against the TRL docs.
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)


from transformers import TrainerCallback, TrainerState, TrainerControl

# Callback that forwards the train/eval loss found in the Trainer logs to wandb
class WandbLoggingCallback(TrainerCallback):
    """Trainer callback that mirrors logged loss values into wandb."""

    def on_log(self, args, state: TrainerState, control: TrainerControl, logs=None, **kwargs):
        """Send the train and/or eval loss found in ``logs`` to wandb.

        Args:
            args: Unused.
            state: Supplies ``global_step``, logged next to each loss as "step".
            control: Unused.
            logs: Dict of logged values, or None (in which case nothing is sent).
            **kwargs: Ignored.
        """
        if logs is None:
            return

        # Train loss is handled first, then validation loss (only present in
        # the logs emitted after an evaluation pass).
        for logs_key, wandb_key in (("loss", "train/loss"), ("eval_loss", "eval/loss")):
            if logs_key in logs:
                wandb.log({wandb_key: logs[logs_key], "step": state.global_step})


# TrainingArguments holds the logging frequency and the other training settings
training_args = TrainingArguments(
    output_dir="/tmp/clm-instruction-tuning",  # output directory
    logging_steps=500,                         # logging frequency: log every 500 steps
    evaluation_strategy="steps",               # evaluate on a step-based schedule
    eval_steps=500,                            # evaluation frequency: every 500 steps
    save_steps=0,                              # NOTE(review): looks redundant while save_strategy="no" below -- confirm
    save_total_limit=0,                        # NOTE(review): earlier comment claimed "no limit on checkpoints"; moot while saving is off -- confirm what 0 means
    save_strategy="no",                        # "no" turns checkpoint saving off entirely
    per_device_train_batch_size=4,      # per-device batch size for training
    per_device_eval_batch_size=4        # per-device batch size for evaluation
)

# Build the trainer from the model, the two dataset splits and the settings above
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_args,                         # pass the TrainingArguments defined above
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    callbacks=[WandbLoggingCallback()]          # attach the wandb loss-logging callback
)


# Run the fine-tuning loop.
trainer.train()


# Peak CUDA memory allocated on device 0, converted from bytes to GiB (1024**3)
# and rounded to one decimal; printed and also logged to wandb.
max_memory_allocated_gb = round(torch.cuda.max_memory_allocated(0) / 1024**3, 1)
print('Max Alloc:', max_memory_allocated_gb, 'GB')
wandb.log({"max_memory_allocated_gb": max_memory_allocated_gb})

[34m[1mwandb[0m: Currently logged in as: [33miamkimhongil92[0m ([33miamkimhongil92-lumenasoft[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


README.md:   0%|          | 0.00/677 [00:00<?, ?B/s]

(…)-00000-of-00001-e270777bb989ac86.parquet:   0%|          | 0.00/3.45M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20022 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


model.safetensors:   0%|          | 0.00/662M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]



Map:   0%|          | 0/16017 [00:00<?, ? examples/s]

Map:   0%|          | 0/4005 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
500,1.9045,1.62646
1000,1.5676,1.481831
1500,1.4568,1.362543
2000,1.3839,1.296385
2500,1.3281,1.249257
3000,1.2874,1.213777
3500,1.1912,1.192925
4000,1.1593,1.136221
4500,0.948,1.130851
5000,0.9288,1.108425


Max Alloc: 22.7 GB


In [2]:
# Hand-entered duration: 34 min 41 s expressed in seconds (2081).
# NOTE(review): presumably the observed training time of the run above -- confirm.
# It is a constant, so re-running the notebook will not update it.
wandb.log({"runtime": 34 * 60 + 41})


In [3]:
# Close the wandb run opened by wandb.init() in the first cell.
wandb.finish()

0,1
eval/loss,██▆▆▅▅▅▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▆███████▇▇▇▇▇█████▇▇▇▇▇
eval/samples_per_second,█▃▁▁▁▁▁▁▁▂▂▂▂▂▁▁▁▁▁▂▂▂▂▂
eval/steps_per_second,█▃▁▁▁▁▁▁▁▂▂▂▂▂▁▁▁▁▁▂▂▂▂▂
max_memory_allocated_gb,▁
runtime,▁
step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,█▅▂▂▄▅▂▁▂▂▄▁▃▂▄▁▂▁▁▃▂▁▃▂

0,1
eval/loss,0.96011
eval/runtime,30.3191
eval/samples_per_second,132.095
eval/steps_per_second,33.048
max_memory_allocated_gb,22.7
runtime,2081.0
step,12000.0
total_flos,1.770568708472832e+16
train/epoch,3.0
train/global_step,12015.0
