In [1]:
#-*- encoding: utf-8 -*- 

# 1. Weight and Bias Login

In [1]:
import wandb
import os
os.environ["WANDB_PROJECT"]="translate_machine_llama3ko_with_orgin_data_300"

wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maeolian83[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# 2. Login Huggingface

In [2]:
from huggingface_hub import login
from dotenv import load_dotenv

load_dotenv()


login(token= os.environ["HF_TOKEN"])

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/aeolian83/.cache/huggingface/token
Login successful


# 3. Dataset Load

In [3]:
from datasets import load_dataset, Dataset, DatasetDict
import pickle

In [4]:
with open('./data/train_data_300.pkl', 'rb') as file:
    train_data = pickle.load(file)
len(train_data)

with open('./data/validation_data_28.pkl', 'rb') as file:
    test_data = pickle.load(file)
len(test_data)

28

In [5]:
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)

# DatasetDict로 "train"과 "test" 데이터셋 묶기
dataset_dict = DatasetDict({
        'train': train_dataset,
        'test': test_dataset
    })

In [6]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['english', 'korean', 'terms'],
        num_rows: 300
    })
    test: Dataset({
        features: ['english', 'korean', 'terms'],
        num_rows: 28
    })
})

In [7]:
dataset_dict["train"][0]

{'english': 'Explainable AI is becoming increasingly important as AI systems are integrated into various industries. With the rise of cloud computing, massive datasets can be processed and analyzed more efficiently, but this often comes at the cost of transparency. By combining explainable AI with cloud computing, organizations can ensure that their AI models are both powerful and understandable. Meanwhile, edge computing allows for data processing closer to the source, which can enhance real-time decision-making capabilities. Integrating explainable AI with edge computing can further improve the trustworthiness and reliability of these real-time systems.',
 'korean': '설명 가능한 AI(explainable AI)는 AI 시스템이 다양한 산업에 통합됨에 따라 점점 더 중요해지고 있습니다. 클라우드 컴퓨팅(cloud computing)의 발전으로 대규모 데이터셋을 더 효율적으로 처리하고 분석할 수 있지만, 이는 종종 투명성의 대가로 이루어집니다. 설명 가능한 AI(explainable AI)와 클라우드 컴퓨팅(cloud computing)을 결합하면 조직은 강력하면서도 이해할 수 있는 AI 모델을 보장할 수 있습니다. 한편, 엣지 컴퓨팅(edge computing)은 데이터 처리를 소스에 더 가깝게 하여 실시간 의사 결정 능력을 향상시킬

# 4. Loading the Model

In [8]:
model_id = "beomi/Llama-3-KoEn-8B"
device_map = {"": 0}
cache_model_dir="/mnt/t7/.cache/huggingface/models"

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [10]:
# Settings for 4-bit QLoRA Training(4bit QLoRA 학습을 위한 설정)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_compute_dtype=torch.bfloat16, # Nvidia의 Ampere 아키텍처 이후 가속기는 bf16으로 속도 향상을 꾀할수 있다. 
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# bnb_4bit_quant_type="nf4" 설정상 기본값은 bnb_4bit_quant_type="fp4"이나 허깅페이스 저자들에 의하면
# 경험적 결과로 "nf4"가 결과가 더 좋았다고 한다. https://huggingface.co/blog/4bit-transformers-bitsandbytes
# bnb_4bit_use_double_quant=True로 하면 매개변수단 0.4bit을 추가로 절약 할 수 있다고 한다. 

In [11]:
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map=device_map, cache_dir=cache_model_dir, trust_remote_code=True)
model.config.use_cache = False

# model.config.pretraining_tp = 1
# 종종 QLoRA 코드에 이 코드가 보이는데 병렬 학습에 쓰이는 코드로 보인다. 

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_model_dir)
tokenizer.padding_side = "right"
tokenizer.add_special_tokens({'pad_token': '<PAD>'})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [13]:
model.resize_token_embeddings(len(tokenizer)) # pad_token이 추가되었으므로 embedding과 language modeling head를 resize

Embedding(128257, 4096)

# 5. LoRA Setup

In [14]:
from peft import LoraConfig, get_peft_model

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

In [15]:
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

# 6. Formatting Dataset

In [16]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['english', 'korean', 'terms'],
        num_rows: 300
    })
    test: Dataset({
        features: ['english', 'korean', 'terms'],
        num_rows: 28
    })
})

In [17]:
dataset = dataset_dict['train']

In [18]:
dataset

Dataset({
    features: ['english', 'korean', 'terms'],
    num_rows: 300
})

In [19]:
# Formatting function
def formatting_func(example):
    output_texts = []
    for i in range(len(example["english"])):
        text = f"Translate input sentence to Korean \n### Input: {example['english'][i]} \n### Translated: {example['korean'][i]}" + tokenizer.eos_token
        output_texts.append(text)

    return output_texts


response_template = " \n### Translated:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# https://huggingface.co/docs/trl/sft_trainer#using-tokenids-directly-for-responsetemplate 참고


In [20]:
# response_template = " \n### Translated:"  # We added context here: "\n". This is enough for this tokenizer
# response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)  # Now we have it like in the dataset texts: `[2277, 29937, 4007, 22137, 29901]`

# # data_collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)

# 7. Training Argument Setup

In [21]:
from transformers import TrainingArguments

In [22]:
checkpoint_dir = "./checkpoint/translate_machine_llama3ko_nonintsuct_origindata300_01"

In [23]:
output_dir = checkpoint_dir
per_device_train_batch_size = 1
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
report_to="wandb"
save_steps = 20
save_total_limit=5
num_train_epochs = 2
logging_steps = 20
learning_rate = 2e-4
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "constant"

In [24]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    logging_steps=logging_steps,
    report_to = report_to,
    learning_rate=learning_rate,
    bf16=True,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

In [25]:
from trl import SFTTrainer

max_seq_length = 1024

In [26]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_arguments,
    formatting_func=formatting_func,
    data_collator=collator,
)



Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [27]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

# 8. Training

In [28]:
# trainer.train()

In [29]:
trainer.train()

Step,Training Loss
20,0.3598
40,0.2539
60,0.2386
80,0.2261
100,0.2202
120,0.2201
140,0.2082
160,0.1837
180,0.146
200,0.1591


Checkpoint destination directory ./checkpoint/translate_machine_llama3ko_nonintsuct_origindata300_01/checkpoint-220 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./checkpoint/translate_machine_llama3ko_nonintsuct_origindata300_01/checkpoint-240 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./checkpoint/translate_machine_llama3ko_nonintsuct_origindata300_01/checkpoint-260 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./checkpoint/translate_machine_llama3ko_nonintsuct_origindata300_01/checkpoint-280 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./checkpoint/translate_machine_llama3ko_nonintsuct_origindata300_01/checkpoint-300 already exists and is non-empty. Saving will proceed but saved results m

TrainOutput(global_step=300, training_loss=0.1971247903505961, metrics={'train_runtime': 490.1479, 'train_samples_per_second': 1.224, 'train_steps_per_second': 0.612, 'total_flos': 9466790000295936.0, 'train_loss': 0.1971247903505961, 'epoch': 2.0})

In [30]:
lora_model_save_dir = "./translate_machine_llama3ko_nonintsuct_origindata300_01_right_padding"

In [31]:
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model  # Take care of distributed/parallel training
model_to_save.save_pretrained(lora_model_save_dir, save_embedding_layers = True)

In [36]:
# lora_config = LoraConfig.from_pretrained(lora_model_save_dir)
# model = get_peft_model(model, lora_config)

In [37]:
# tokenizer.push_to_hub('aeolian83/llama_ko_sft_gugugo_experi_01')

CommitInfo(commit_url='https://huggingface.co/aeolian83/llama_ko_sft_gugugo_experi_01/commit/19dd71bb9c3aebf4c5be4ad2c4a15d34a7a999d6', commit_message='Upload tokenizer', commit_description='', oid='19dd71bb9c3aebf4c5be4ad2c4a15d34a7a999d6', pr_url=None, pr_revision=None, pr_num=None)

In [32]:
torch.cuda.empty_cache()

In [33]:
from peft import PeftModel

In [34]:
loaded_model = PeftModel.from_pretrained(
    model=model,
    model_id=lora_model_save_dir
)

In [35]:
dataset_dict['test']['english'][0]

'Group sparsity is a concept in multilinear algebra that promotes sparsity patterns within groups of variables. This technique is particularly useful in applications involving high-dimensional data, where it helps to identify relevant groups of features. In the context of factor graphs, group sparsity can enhance the efficiency of inference algorithms by reducing the complexity of the graph structure. Multilinear algebra provides the mathematical foundation for understanding and manipulating the interactions between these groups. By leveraging group sparsity and multilinear algebra, factor graphs can be optimized to handle large-scale problems more effectively.'

In [36]:
examples = [
    f'''
Translate input sentence to Korean
### Input: {dataset_dict['test']['english'][0]}
''',
    f'''
Translate input sentence to Korean
### Input: {dataset_dict['test']['english'][1]}
''',
 f'''
Translate input sentence to Korean
### Input: {dataset_dict['test']['english'][2]}
''']

In [37]:
example_batch = tokenizer(examples, return_tensors="pt", padding=True)['input_ids'].to(loaded_model.device)

In [38]:
with torch.cuda.amp.autocast():
    output_tokens = loaded_model.generate(example_batch, max_new_tokens = 1024, pad_token_id=tokenizer.pad_token_id)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [39]:
outputs = [tokenizer.decode(t, skip_special_tokens=True) for t in output_tokens]
for o in outputs:
    print(o)
    print('#'*100)


Translate input sentence to Korean
### Input: Group sparsity is a concept in multilinear algebra that promotes sparsity patterns within groups of variables. This technique is particularly useful in applications involving high-dimensional data, where it helps to identify relevant groups of features. In the context of factor graphs, group sparsity can enhance the efficiency of inference algorithms by reducing the complexity of the graph structure. Multilinear algebra provides the mathematical foundation for understanding and manipulating the interactions between these groups. By leveraging group sparsity and multilinear algebra, factor graphs can be optimized to handle large-scale problems more effectively.
제안: 그룹 희소성(group sparsity)은 변수 집합 내에서 희소성 패턴을 촉진하는 다변수 대수(multilinear algebra)의 개념입니다. 이 기술은 고차원 데이터를 다루는 응용 분야에서 특히 유용하며, 관련된 특징 집합을 식별하는 데 도움을 줍니다. 인자 그래프(factor graphs)에서 그룹 희소성(group sparsity)은 그래프 구조의 복잡성을 줄여 추론 알고리즘의 효율성을 향상시킬 수 있습니다. 다변수 대수(multilinear algebra)는 이러한 그룹 간의 상호작용

In [40]:
tokenizer.eos_token

'<|end_of_text|>'