In [1]:
#-*- encoding: utf-8 -*- 

# 1. Weight and Bias Login

In [12]:
import wandb
import os
os.environ["WANDB_PROJECT"]="translate_machine_llama3ko_with_orgin_data_300"

wandb.login()

True

# 2. Login Huggingface

In [13]:
from huggingface_hub import login
from dotenv import load_dotenv

load_dotenv()


login(token= os.environ["HF_TOKEN"])

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/aeolian83/.cache/huggingface/token
Login successful


# 3. Dataset Load

In [4]:
from datasets import load_dataset, Dataset, DatasetDict
import pickle

In [5]:
with open('./data/train_data_300.pkl', 'rb') as file:
    train_data = pickle.load(file)
len(train_data)

with open('./data/validation_data_28.pkl', 'rb') as file:
    test_data = pickle.load(file)
len(test_data)

28

In [6]:
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)

# DatasetDict로 "train"과 "test" 데이터셋 묶기
dataset_dict = DatasetDict({
        'train': train_dataset,
        'test': test_dataset
    })

In [7]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['english', 'korean', 'terms'],
        num_rows: 300
    })
    test: Dataset({
        features: ['english', 'korean', 'terms'],
        num_rows: 28
    })
})

In [8]:
dataset_dict["train"][0]

{'english': 'Explainable AI is becoming increasingly important as AI systems are integrated into various industries. With the rise of cloud computing, massive datasets can be processed and analyzed more efficiently, but this often comes at the cost of transparency. By combining explainable AI with cloud computing, organizations can ensure that their AI models are both powerful and understandable. Meanwhile, edge computing allows for data processing closer to the source, which can enhance real-time decision-making capabilities. Integrating explainable AI with edge computing can further improve the trustworthiness and reliability of these real-time systems.',
 'korean': '설명 가능한 AI(explainable AI)는 AI 시스템이 다양한 산업에 통합됨에 따라 점점 더 중요해지고 있습니다. 클라우드 컴퓨팅(cloud computing)의 발전으로 대규모 데이터셋을 더 효율적으로 처리하고 분석할 수 있지만, 이는 종종 투명성의 대가로 이루어집니다. 설명 가능한 AI(explainable AI)와 클라우드 컴퓨팅(cloud computing)을 결합하면 조직은 강력하면서도 이해할 수 있는 AI 모델을 보장할 수 있습니다. 한편, 엣지 컴퓨팅(edge computing)은 데이터 처리를 소스에 더 가깝게 하여 실시간 의사 결정 능력을 향상시킬

# 4. Loading the Model

In [9]:
model_id = "beomi/Llama-3-KoEn-8B"
device_map = {"": 0}
cache_model_dir="/mnt/t7/.cache/huggingface/models"

In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [19]:
# Settings for 4-bit QLoRA Training(4bit QLoRA 학습을 위한 설정)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_compute_dtype=torch.bfloat16, # Nvidia의 Ampere 아키텍처 이후 가속기는 bf16으로 속도 향상을 꾀할수 있다. 
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# bnb_4bit_quant_type="nf4" 설정상 기본값은 bnb_4bit_quant_type="fp4"이나 허깅페이스 저자들에 의하면
# 경험적 결과로 "nf4"가 결과가 더 좋았다고 한다. https://huggingface.co/blog/4bit-transformers-bitsandbytes
# bnb_4bit_use_double_quant=True로 하면 매개변수단 0.4bit을 추가로 절약 할 수 있다고 한다. 

In [12]:
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map=device_map, cache_dir=cache_model_dir, trust_remote_code=True)
model.config.use_cache = False

# model.config.pretraining_tp = 1
# 종종 QLoRA 코드에 이 코드가 보이는데 병렬 학습에 쓰이는 코드로 보인다. 

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_model_dir)
tokenizer.padding_side = "left"
tokenizer.add_special_tokens({'pad_token': '<PAD>'})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [14]:
model.resize_token_embeddings(len(tokenizer)) # pad_token이 추가되었으므로 embedding과 language modeling head를 resize

Embedding(128257, 4096)

# 5. LoRA Setup

In [15]:
from peft import LoraConfig, get_peft_model

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

In [16]:
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

# 6. Formatting Dataset

In [17]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['english', 'korean', 'terms'],
        num_rows: 300
    })
    test: Dataset({
        features: ['english', 'korean', 'terms'],
        num_rows: 28
    })
})

In [18]:
dataset = dataset_dict['train']

In [19]:
dataset

Dataset({
    features: ['english', 'korean', 'terms'],
    num_rows: 300
})

In [20]:
# Formatting function
def formatting_func(example):
    output_texts = []
    for i in range(len(example["english"])):
        text = f"Translate input sentence to Korean \n### Input: {example['english'][i]} \n### Translated: {example['korean'][i]}" + tokenizer.eos_token
        output_texts.append(text)

    return output_texts


response_template = " \n### Translated:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# https://huggingface.co/docs/trl/sft_trainer#using-tokenids-directly-for-responsetemplate 참고


In [21]:
# response_template = " \n### Translated:"  # We added context here: "\n". This is enough for this tokenizer
# response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)  # Now we have it like in the dataset texts: `[2277, 29937, 4007, 22137, 29901]`

# # data_collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)

# 7. Training Argument Setup

In [22]:
from transformers import TrainingArguments

In [23]:
checkpoint_dir = "./checkpoint/translate_machine_llama3ko_nonintsuct_origindata300_01"

In [24]:
output_dir = checkpoint_dir
per_device_train_batch_size = 1
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
report_to="wandb"
save_steps = 20
save_total_limit=5
num_train_epochs = 2
logging_steps = 20
learning_rate = 2e-4
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "constant"

In [25]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    logging_steps=logging_steps,
    report_to = report_to,
    learning_rate=learning_rate,
    bf16=True,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

In [26]:
from trl import SFTTrainer

max_seq_length = 1024

In [27]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_arguments,
    formatting_func=formatting_func,
    data_collator=collator,
)



Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [28]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

# 8. Training

In [29]:
# trainer.train()

In [30]:
trainer.train()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011114492888898693, max=1.0…

Step,Training Loss
20,0.3595
40,0.254
60,0.2387
80,0.2263
100,0.219
120,0.2196
140,0.209
160,0.1832
180,0.1459
200,0.1586




TrainOutput(global_step=300, training_loss=0.19696848551432292, metrics={'train_runtime': 499.2441, 'train_samples_per_second': 1.202, 'train_steps_per_second': 0.601, 'total_flos': 9466790000295936.0, 'train_loss': 0.19696848551432292, 'epoch': 2.0})

In [47]:
lora_model_save_dir = "./results/translate_machine_llama3ko_origindata300_01"

In [48]:
trainer.save_model(lora_model_save_dir)

# model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model  # Take care of distributed/parallel training
# model_to_save.save_pretrained(lora_model_save_dir, save_embedding_layers = True)

In [26]:
model_id = "beomi/Llama-3-KoEn-8B"
device_map = {"": 0}
cache_model_dir="/mnt/t7/.cache/huggingface/models"

In [27]:
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map=device_map, cache_dir=cache_model_dir, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [28]:
tokenizer = AutoTokenizer.from_pretrained(lora_model_save_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [29]:
lora_config = LoraConfig.from_pretrained(lora_model_save_dir)
model.resize_token_embeddings(len(tokenizer))

Embedding(128257, 4096)

In [30]:
loaded_model = PeftModel.from_pretrained(
    model=model,
    model_id=lora_model_save_dir
)

In [31]:
# Formatting function
def formatting_func_inference(example):
    output_texts = []
    for i in range(len(example["Input"])):
        text = f"Translate input sentence to Korean \n### Input: {example['Input'][i]} \n### Translated: "
        inputs = tokenizer(text, return_tensors="pt")
        output_texts.append(inputs)

    return output_texts


example = {"Input": [
    "Despite their sample quality, our models do not have competitive log likelihoods compared to other likelihood-based models.",
    "Our models do, however, have log likelihoods better than the large estimates annealed importance sampling has been reported to produce for energy based models and score matching.",
    "We focus on Latent Diffusion Models since they can perform a wide range of generative tasks. This work shows that simply fine-tuning a small part of the generative model.",
]}

tokenized_example = formatting_func_inference(example)
tokenized_example

[{'input_ids': tensor([[128000,  28573,   1988,  11914,    311,  16526,    720,  14711,   5688,
              25,  18185,    872,   6205,   4367,     11,   1057,   4211,    656,
             539,    617,  15022,   1515,  29736,     82,   7863,    311,   1023,
           29736,   6108,   4211,     13,    720,  14711,   4149,  22851,     25,
             220]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[128000,  28573,   1988,  11914,    311,  16526,    720,  14711,   5688,
              25,   5751,   4211,    656,     11,   4869,     11,    617,   1515,
           29736,     82,   2731,   1109,    279,   3544,  17989,  86997,   5962,
           12939,  25936,    706,   1027,   5068,    311,   8356,    369,   4907,
            3196,   4211,    323,   5573,  12864,     13,    720,  14711,   4149,
           22851,     25,    220]]), 'attention_mask': tensor

In [32]:
outputs = []
with torch.cuda.amp.autocast():
    for ex in tokenized_example:
        ex = ex.to(model.device)
        pred = model.generate(
            **ex,
            max_new_tokens=1024,
            pad_token_id=tokenizer.pad_token_id,
        )
        outputs.append(pred)

outputs = [tokenizer.batch_decode(t, skip_special_tokens=True)[0] for t in outputs]
for o in outputs:
    print(o)
    print("#" * 100)

Translate input sentence to Korean 
### Input: Despite their sample quality, our models do not have competitive log likelihoods compared to other likelihood-based models. 
### Translated:  우리 모델은 샘플 품질에도 불구하고, 다른 가능성 기반 모델에 비해 경쟁력 있는 로그 가능성을 갖지 않습니다.- 1st
- 11:00 pm
- 2nd
- 11:00 pm
- 3rd
- 11:00 pm
- 4th
- 11:00 pm
- 5th
- 11:00 pm
- 6th
- 11:00 pm
- 7th
- 11:00 pm
- 8th
- 11:00 pm
- 9th
- 11:00 pm
- 10th
- 11:00 pm
- 11th
- 11:00 pm
- 12th
- 11:00 pm
- 13th
- 11:00 pm
- 14th
- 11:00 pm
- 15th
- 11:00 pm
- 16th
- 11:00 pm
- 17th
- 11:00 pm
- 18th
- 11:00 pm
- 19th
- 11:00 pm
- 20th
- 11:00 pm
- 21st
- 11:00 pm
- 22nd
- 11:00 pm
- 23rd
- 11:00 pm
- 24th
- 11:00 pm
- 25th
- 11:00 pm
- 26th
- 11:00 pm
- 27th
- 11:00 pm
- 28th
- 11:00 pm
- 29th
- 11:00 pm
- 30th
- 11:00 pm- 1st
- 11:00 pm
- 2nd
- 11:00 pm
- 3rd
- 11:00 pm
- 4th
- 11:00 pm
- 5th
- 11:00 pm
- 6th
- 11:00 pm
- 7th
- 11:00 pm
- 8th
- 11:00 pm
- 9th
- 11:00 pm
- 10th
- 11:00 pm
- 11th
- 11:00 pm
- 12th
- 11:00 pm
- 13th
- 11:0