In [6]:
import torch
# CUDA compute capability of the current device as a (major, minor) pair.
# The install cell branches on `major_version`.
compute_capability = torch.cuda.get_device_capability()
major_version, minor_version = compute_capability
print(*compute_capability)

8 9


In [4]:
%%capture
# Colab ships torch 2.2.1, so these packages are installed separately to avoid package conflicts.
# NOTE(review): the model-loading output recorded in this notebook reports PyTorch 2.3.0 -
# confirm the version remark above is still accurate.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this branch for newer GPUs (e.g. Ampere, Hopper GPUs - RTX 30xx, RTX 40xx, A100, H100, L40).
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this branch for older GPUs (e.g. V100, Tesla T4, RTX 20xx).
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [1]:
from unsloth import FastLanguageModel
# Loader settings. `max_seq_length` is reused by the trainer and generation cells.
max_seq_length = 4096
# None is handed to the loader as-is (presumably lets Unsloth pick the dtype - confirm in its docs).
dtype = None
# Request 4-bit weights from the loader.
load_in_4bit = True

# Load the base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="MLP-KTLim/llama-3-Korean-Bllossom-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    'q_proj', 'k_proj', 'v_proj', 'o_proj',
    'gate_proj', 'up_proj', 'down_proj',
]

# Wrap the loaded model with LoRA adapters.
# NOTE(review): the Unsloth output recorded for this cell warns that only
# dropout = 0 is supported for fast patching and that dropout = 0.05 causes a
# performance hit; the value is kept as the author chose it.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=lora_target_modules,
    bias='none',
    use_gradient_checkpointing='unsloth',
    random_state=123,
    use_rslora=False,
    loftq_config=None,
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.7 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [4]:
from datasets import load_dataset
# End-of-sequence token string taken from the tokenizer; formatting_prompts_func
# appends it to every rendered training example.
EOS_TOKEN=tokenizer.eos_token
# Prompt template with two positional `{}` slots: the first is filled with the
# instruction, the second with the response (left empty at inference time).
alpaca_prompt="""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""

def formatting_prompts_func(example):
    """Render a batch of instruction/output pairs into training texts.

    Args:
        example: Mapping with parallel 'instruction' and 'output' sequences
            (a batch, as supplied by ``dataset.map(..., batched=True)``).

    Returns:
        A dict with a single 'text' key holding one string per pair: the
        ``alpaca_prompt`` template filled with the instruction and output,
        followed by ``EOS_TOKEN``. Pairs are matched with ``zip``, so extra
        items in the longer sequence are ignored.
    """
    return {
        'text': [
            alpaca_prompt.format(instruction, output) + EOS_TOKEN
            for instruction, output in zip(example['instruction'], example['output'])
        ]
    }

# Load the train split and add the rendered `text` column, one batch at a time.
dataset = load_dataset('bigdefence/custom', split='train').map(
    formatting_prompts_func,
    batched=True,
)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Pad on the right-hand side. The attribute the tokenizer reads is
# `padding_side`; assigning `padding_size` only creates an unused attribute,
# so the setting would silently have no effect.
tokenizer.padding_side = 'right'

# Supervised fine-tuning trainer over the rendered `text` column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    # NOTE(review): evaluation reuses the training set, so the reported
    # validation loss is not a held-out measurement.
    eval_dataset=dataset,
    dataset_text_field='text',
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # 2 x 4 = 8 examples per optimizer step
        warmup_steps=5,
        # Training length is controlled by max_steps alone. num_train_epochs
        # is intentionally not passed: when max_steps is given it overrides
        # num_train_epochs, so passing both only triggers a warning.
        max_steps=100,
        do_eval=True,
        evaluation_strategy='steps',
        logging_steps=1,
        learning_rate=2e-4,
        # Exactly one of fp16/bf16 is enabled, depending on GPU bf16 support.
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        optim='adamw_8bit',
        weight_decay=0.01,
        lr_scheduler_type='cosine',
        seed=123,
        output_dir='outputs',
    ),
)



Map (num_proc=2):   0%|          | 0/10 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [8]:
# Report the GPU memory state before training starts.
bytes_per_gib = 1024 ** 3

# Properties of device 0 (name, total memory).
gpu_stats = torch.cuda.get_device_properties(0)
# Peak reserved memory so far, in GB; used later as the pre-training baseline.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
# Total memory of the device, in GB.
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA L4. Max memory = 22.168 GB.
5.633 GB of memory reserved.


In [9]:
# Run fine-tuning; the returned object is kept because the reporting cell reads
# trainer_stats.metrics['train_runtime'] from it.
trainer_stats=trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10 | Num Epochs = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.2421,2.167339
2,2.1,2.122009
3,2.1522,1.917349
4,1.9054,1.585207
5,1.5975,1.188428
6,1.2051,0.911244
7,0.9431,0.719377
8,0.7382,0.550289
9,0.5801,0.4361
10,0.4428,0.329999


In [10]:
# Show the final memory and timing statistics.

# Peak reserved memory, in GB.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
# Growth over the pre-training baseline, in GB (attributed to LoRA training).
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# Both figures as a percentage of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Training runtime as reported by the trainer (printed below as seconds).
train_runtime = trainer_stats.metrics['train_runtime']

report_lines = (
    f"{train_runtime} seconds used for training.",
    f"{round(train_runtime/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
for report_line in report_lines:
    print(report_line)

485.9731 seconds used for training.
8.1 minutes used for training.
Peak reserved memory = 7.4 GB.
Peak reserved memory for training = 1.767 GB.
Peak reserved memory % of max memory = 33.381 %.
Peak reserved memory for training % of max memory = 7.971 %.


In [14]:
from transformers import StoppingCriteria,StoppingCriteriaList
class stopOnTokens(StoppingCriteria):
    """Stopping criterion that ends generation once a given token id is produced.

    Only the first sequence of the batch is inspected.
    """

    def __init__(self,stop_token_id):
        """Remember the token id whose generation should end decoding.

        Args:
            stop_token_id: Integer id of the stop token.
        """
        self.stop_token_id=stop_token_id

    def __call__(self, input_ids, scores, **kwargs):
        """Return True when the newest token of the first sequence is the stop token.

        ``input_ids`` is indexed per sequence and holds the tokens seen so far
        (prompt included). Only the last position is compared: scanning the
        whole row would stop immediately whenever the prompt itself contains
        the stop token, and would rescan the full sequence on every step.

        Args:
            input_ids: Batch of token-id sequences; row 0 is inspected.
            scores: Unused; accepted for interface compatibility.
            **kwargs: Unused; accepted for interface compatibility.

        Returns:
            bool: True if the last token of row 0 equals ``stop_token_id``;
            False otherwise, including when the row is empty.
        """
        first_sequence = input_ids[0]
        if len(first_sequence) == 0:
            return False
        return int(first_sequence[-1]) == self.stop_token_id
# Token string at which generation should stop (presumably Llama-3's end-of-turn marker - confirm).
stop_token="<|eot_id|>"
# Encode without adding special tokens and keep the first id.
# NOTE(review): assumes the string maps to exactly one token id - confirm against the tokenizer.
stop_token_id=tokenizer.encode(stop_token,add_special_tokens=False)[0]
# Wrap the criterion in the list type; it is passed to model.generate() as `stopping_criteria`.
stopping_criteria=StoppingCriteriaList([stopOnTokens(stop_token_id)])

In [15]:
from transformers import TextStreamer
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Fill the template with the instruction and leave the response slot empty
# so that the model writes the answer.
prompt = alpaca_prompt.format("카카오vx에 대해 설명해줘", "")
inputs = tokenizer([prompt], return_tensors='pt').to('cuda')

# Stream decoded tokens to stdout as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=max_seq_length,
    stopping_criteria=stopping_criteria,
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
카카오vx에 대해 설명해줘

### Response:
KakaoVX는 카카오 계열사로, 가상현실(VR) 및 확장현실(XR) 기술을 활용한 서비스를 개발하고 제공하는 회사입니다. 주요 사업 영역은 VR/XR 콘텐츠 개발, 플랫폼 운영, 그리고 관련 기술 연구 및 개발입니다.<|eot_id|>


In [16]:
# Settings for publishing the fine-tuned model.
# Identifier of the checkpoint that was fine-tuned.
base_model = "MLP-KTLim/llama-3-Korean-Bllossom-8B"
# Name of the Hugging Face repository to push to.
huggingface_repo = "llama-3-Bllossom-unsloth-finetune"
# Save mode passed to push_to_hub_merged.
save_method = "merged_16bit"

In [18]:
import os
from getpass import getpass

# Never paste the access token into the notebook: read it from the HF_TOKEN
# environment variable, or prompt for it without echoing when it is not set.
hf_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face write token: ')

# Merge the LoRA weights into the base model and upload the result together
# with the tokenizer.
model.push_to_hub_merged(
    huggingface_repo,
    tokenizer,
    save_method=save_method,
    token=hf_token,
)

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 16.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 30.96 out of 52.96 RAM for saving.


 78%|███████▊  | 25/32 [00:01<00:00, 18.23it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:12<00:00,  2.50it/s]


Unsloth: Saving to organization with address bigdefence/llama-3-Bllossom-unsloth-finetune
Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving to organization with address bigdefence/llama-3-Bllossom-unsloth-finetune
Unsloth: Uploading all files... Please wait...


model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/None/llama-3-Bllossom-unsloth-finetune
