# 1. Weight and Bias Login

In [1]:
import wandb
import os
os.environ["WANDB_PROJECT"]="Translate_prac_02"

wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maeolian83[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# 2. Login Huggingface

In [2]:
from huggingface_hub import login
from dotenv import load_dotenv

load_dotenv()


login(token= os.environ["HF_TOKEN"])

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/aeolian83/.cache/huggingface/token
Login successful


# 3. Dataset Load

In [3]:
from datasets import load_dataset, Dataset, DatasetDict
import pickle

In [4]:
with open('./data/technical_sentences_gpt_1039.pickle', 'rb') as file:
    inputs = pickle.load(file)
len(inputs)

1039

In [5]:
with open('./data/translated_sentences_gpt_1039.pickle', 'rb') as file:
    outputs = pickle.load(file)
len(outputs)

1039

In [6]:
# Create a dictionary with your data
data = {
    "Input": inputs,
    "Translated": outputs
}

# Create the DatasetDict
dataset = DatasetDict({"train": Dataset.from_dict(data)})

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Input', 'Translated'],
        num_rows: 1039
    })
})


# 4. Loading the Model

In [7]:
model_id = "beomi/Llama-3-KoEn-8B-Instruct-preview"
device_map = {"": 0}
cache_model_dir="/mnt/t7/.cache/huggingface/models"

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [9]:
# Settings for 4-bit QLoRA Training(4bit QLoRA 학습을 위한 설정)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_compute_dtype=torch.bfloat16, # Nvidia의 Ampere 아키텍처 이후 가속기는 bf16으로 속도 향상을 꾀할수 있다. 
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# bnb_4bit_quant_type="nf4" 설정상 기본값은 bnb_4bit_quant_type="fp4"이나 허깅페이스 저자들에 의하면
# 경험적 결과로 "nf4"가 결과가 더 좋았다고 한다. https://huggingface.co/blog/4bit-transformers-bitsandbytes
# bnb_4bit_use_double_quant=True로 하면 매개변수단 0.4bit을 추가로 절약 할 수 있다고 한다. 

In [10]:
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map=device_map, cache_dir=cache_model_dir, trust_remote_code=True)
model.config.use_cache = False

# model.config.pretraining_tp = 1
# 종종 QLoRA 코드에 이 코드가 보이는데 병렬 학습에 쓰이는 코드로 보인다. 

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_model_dir)
tokenizer.add_special_tokens({'pad_token': '<PAD>'})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [12]:
# 이 코드를 쓰지 않는 경우(물론 패딩 토큰을 별도로 사용하는 경우에 해당됨) loss가 0으로 떨어지는 경우가 있다함
tokenizer.padding_side = "left"

In [13]:
model.resize_token_embeddings(len(tokenizer)) # pad_token이 추가되었으므로 embedding과 language modeling head를 resize

Embedding(128257, 4096)

# 5. LoRA Setup

In [14]:
from peft import LoraConfig, get_peft_model

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

In [15]:
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Input', 'Translated'],
        num_rows: 1039
    })
})

# 6. Formatting Dataset

In [21]:
def format_instruction(sample):
    system_prompt = f"### system prompt: Translate the following English text related to Computer Science into Korean."
    input = f"### Input: {sample['Input']}" if len(sample["Input"]) > 0 else None
    output = f"### output: {sample['Translated']}"
    # join all the parts together
    prompt = "\n\n".join([i for i in [system_prompt, input, output] if i is not None])
    return prompt

# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{format_instruction(sample)}{tokenizer.eos_token}"
    return sample

In [23]:
train_dataset = dataset['train'].map(template_dataset, remove_columns=list(dataset['train'].features), num_proc=10)

Map (num_proc=10):   0%|          | 0/1039 [00:00<?, ? examples/s]

In [24]:
train_dataset["text"][100]

'### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).\n\n### Input: Graph neural networks (GNNs) have emerged as powerful tools for learning on graph-structured data, with applications ranging from social network analysis to molecular biology.\n\n### output: 그래프 신경망(Graph Neural Networks, GNNs)은 소셜 네트워크 분석에서 분자 생물학에 이르기까지 다양한 응용 분야에서 그래프 구조화된 데이터 학습에 강력한 도구로 떠오르고 있습니다.<|end_of_text|>'

# 7. Training Argument Setup

In [25]:
from transformers import TrainingArguments

In [26]:
checkpoint_dir = "./checkpoint/translate_machine_llama3_01"

In [27]:
output_dir = checkpoint_dir
per_device_train_batch_size = 1
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
report_to="wandb"
save_steps = 20
save_total_limit=5
num_train_epochs = 2
logging_steps = 20
learning_rate = 2e-4
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "constant"

In [28]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    logging_steps=logging_steps,
    report_to = report_to,
    learning_rate=learning_rate,
    bf16=True,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

In [29]:
from trl import SFTTrainer

max_seq_length = 1024

In [30]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    # max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)



Map:   0%|          | 0/1039 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [31]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

# 8. Training

In [32]:
# trainer.train()

In [34]:
trainer.train()

Step,Training Loss
20,1.507
40,0.6963
60,0.6781
80,0.6297
100,0.5752
120,0.5954
140,0.5484
160,0.5633
180,0.4696
200,0.4741


config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]



TrainOutput(global_step=1038, training_loss=0.45145035754738516, metrics={'train_runtime': 1276.3885, 'train_samples_per_second': 1.628, 'train_steps_per_second': 0.813, 'total_flos': 1.0469490429640704e+16, 'train_loss': 0.45145035754738516, 'epoch': 2.0})

In [36]:
lora_model_save_dir = "./results/translate_machine_llama3_01_2epoch"

In [37]:
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model  # Take care of distributed/parallel training
model_to_save.save_pretrained(lora_model_save_dir, save_embedding_layers = True)

In [36]:
# lora_config = LoraConfig.from_pretrained(lora_model_save_dir)
# model = get_peft_model(model, lora_config)

In [37]:
# tokenizer.push_to_hub('aeolian83/llama_ko_sft_gugugo_experi_01')

CommitInfo(commit_url='https://huggingface.co/aeolian83/llama_ko_sft_gugugo_experi_01/commit/19dd71bb9c3aebf4c5be4ad2c4a15d34a7a999d6', commit_message='Upload tokenizer', commit_description='', oid='19dd71bb9c3aebf4c5be4ad2c4a15d34a7a999d6', pr_url=None, pr_revision=None, pr_num=None)

In [38]:
torch.cuda.empty_cache()

In [40]:
from peft import PeftModel

In [41]:
loaded_model = PeftModel.from_pretrained(
    model=model,
    model_id=lora_model_save_dir
)

In [47]:
examples = [
    '''
### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: Despite their sample quality, our models do not have competitive log likelihoods compared to other likelihood-based models.
''',
    '''
### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: Our models do, however, have log likelihoods better than the large estimates annealed importance sampling has been reported to produce for energy based models and score matching.
''',
 '''
### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: We focus on Latent Diffusion Models since they can perform a wide range of generative tasks. This work shows that simply fine-tuning a small part of the generative model.
''']

In [48]:
example_batch = tokenizer(examples, return_tensors="pt", padding=True)['input_ids'].to(loaded_model.device)

In [49]:
with torch.cuda.amp.autocast():
    output_tokens = loaded_model.generate(example_batch, max_new_tokens = 1024, pad_token_id=tokenizer.pad_token_id)

In [50]:
outputs = [tokenizer.decode(t, skip_special_tokens=True) for t in output_tokens]
for o in outputs:
    print(o)
    print('#'*100)


### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: Despite their sample quality, our models do not have competitive log likelihoods compared to other likelihood-based models.
### output: 샘플 품질에도 불구하고, 우리의 모델은 다른 가능성 기반 모델과 비교하여 경쟁력 있는 로그 가능성(log likelihood)을 갖지 않습니다.
####################################################################################################

### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: Our models do, however, have log likelihoods better than the large estimates annealed importance sampling has been reported to produce for energy based models and score matching.
### output: 우리의 모델은 그러나 에너지 기반 모델과 점수 매칭에서 대규모 추정치가 중요 

In [53]:
examples = [
    '''
### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: Large Language Models (LLM) represent the most recent advances in Natural Language Processing (NLP) demonstrating a wide range of capabilities in language processing [Zhao et al.(2023)]. They came into prominence after ChatGPT, an application by OpenAI that opened for public testing, went vira This has fueled attempts to use LLMs for a variety of applications ranging from creative writing [Gómez-Rodríguez and Williams(2023)], to programming [Liventsev et al.(2023)], legal [Louis et al.(2023)] and medical [He et al.(2023)] domains which require greater factual accuracy.
''',
    '''
### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: A promising area of application for LLMs is question answering over proprietary organizational documents such as governance/policy manuals. Such documents are often a regular point of reference as they guide the day-to-day operations and decision making within an organization. This results in frequent references to such documents or to experts within the organization who respond to queries about such information. Hence there is potential for increased efficiency from having an application that can respond to a diverse range of user queries based on organizational documents.
''',
 '''
### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: There are several considerations when deploying an LLM application in such settings. One major concern is the security risks given the confidential nature of such documents. As a result, it is not possible to use proprietary LLM models over an API due to data leakage risk $2^{2}$ This necessitates the use of open source models that can be deployed on-premise. A second concern is limited computational resources as well as relatively smaller training datasets that can be generated based on the available documents. Finally, any such application must be able to reliably and correctly respond to[^0]user queries. Therefore, deploying a robust application in such settings is not trivial, requiring many decisions and customization.
''']

In [54]:
example_batch = tokenizer(examples, return_tensors="pt", padding=True)['input_ids'].to(loaded_model.device)

In [56]:
with torch.cuda.amp.autocast():
    output_tokens = loaded_model.generate(example_batch, max_new_tokens = 2048, pad_token_id=tokenizer.pad_token_id)

In [57]:
outputs = [tokenizer.decode(t, skip_special_tokens=True) for t in output_tokens]
for o in outputs:
    print(o)
    print('#'*100)


### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: Large Language Models (LLM) represent the most recent advances in Natural Language Processing (NLP) demonstrating a wide range of capabilities in language processing [Zhao et al.(2023)]. They came into prominence after ChatGPT, an application by OpenAI that opened for public testing, went vira This has fueled attempts to use LLMs for a variety of applications ranging from creative writing [Gómez-Rodríguez and Williams(2023)], to programming [Liventsev et al.(2023)], legal [Louis et al.(2023)] and medical [He et al.(2023)] domains which require greater factual accuracy.
### output: 대규모 언어 모델(Large Language Models, LLMs)은 자연어 처리(Natural Language Processing, NLP)에서 가장 최근의 발전으로 언어 처리에서 다양한 능력을 보여줍니다. [Zhao et al.(2023)] ChatGPT의 공개 테스트가 시작된 후 LLMs는 창의적 글쓰기 [Gómez-Rodríg

In [58]:
examples = [
    '''
### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: Retrieval-Augmented Generation (RAG) enhances the performance of LLMs on domain specific tasks by providing the model with an external source of information. While there are many variations, we provide an overview of a typical RAG application in Algorithm 1. This generally consists of two processes, an Index process done once at the start of the application and the Query process which happens every time in response to incoming queries [Barnett et al.(2024)]. The index process occurs as follows. The input document $D$ is split into discrete chunks $\left\{c_{1}, c_{2}, \ldots, c_{n}\right\}$ (steps $2 \& 3$ ). Using an encoder model, the split chunks $c_{i}$ are converted to embedding vectors $\vec{d}_{i}=\operatorname{encoder}\left(c_{i}\right)$ (step 4) which are then stored in a vector database (step 5). This database is later used to retrieve relevant chunks for a given query.
''',
    '''
### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: The Query processing happens in response to incoming user queries. For a given query $q$, the encoding model is used to create a vector embedding of the query $\vec{v}=\operatorname{encoder}(q)$. The database is then searched to find the top $k$ chunk embeddings $\left\{\overrightarrow{d_{1}}, \overrightarrow{d_{2}}, \ldots, \overrightarrow{d_{k}}\right\}$ that are similar to the query embedding $\vec{v}$. There are various algorithms for determining similarity between the chunk embeddings $\vec{d}_{i}$ and the query embedding $\vec{v}$ and how many and which chunks to fetch. The top $k$ chunks $\left\{c_{1}, c_{2}, \ldots, c_{k}\right\}$ retrieved from the database, along with the query, are then passed into the prompt template. The completed prompt is then input to an LLM model which generates an output based on the provided information. This response is then returned to the user.
''',
 '''
### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: The overall workflow of our system, Tree-RAG (T-RAG), is shown in Figure 1 and outlined in Algorithm 2. Our system differs from the typical RAG application in the Query process. Instead of using an existing pre-trained LLM, we use a finetuned version of the LLM for answer generation; we finetuned the LLM model on an instruction dataset of questions and answers generated based on the organization's document as described in later sections.
''']

In [59]:
example_batch = tokenizer(examples, return_tensors="pt", padding=True)['input_ids'].to(loaded_model.device)

with torch.cuda.amp.autocast():
    output_tokens = loaded_model.generate(example_batch, max_new_tokens = 2048, pad_token_id=tokenizer.pad_token_id)

outputs = [tokenizer.decode(t, skip_special_tokens=True) for t in output_tokens]
for o in outputs:
    print(o)
    print('#'*100)


### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
ight)$ (step 4) which are then stored in a vector database (step 5). This database is later used to retrieve relevant chunks for a given query.r}\left(c_{i}information. While there are many variations, we provide an overview of a typical RAG application in Algorithm 1. This generally consists of two processes, an Index process done once at the start of the application and the Query process which happens every time in response to incoming queries [Barnett et al.(2024)]. The index process occurs as follows. The input document $D$ is split into discrete chunks $\left\{c_{1}, c_{2}, \ldots, c_{n}
### output: 검색 증강 생성(Retrieval-Augmented Generation, RAG)은 도메인 특화 작업에서 LLM의 성능을 향상시키기 위해 외부 정보원을 모델에 제공합니다. 많은 변형이 있지만, 알고리즘 1의 전형적인 RAG 응용 프로그램에 대한 개요를 제공합니다. 일반적으로 이 응용 프로그램은 두 가지 프로세스로

In [60]:
examples = [
    '''
### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: A feature of T-RAG is the inclusion of an entities tree in addition to the vector database for context retrieval. The entities tree holds information about entities in the organization and their location within the hierarchy. Each node in this tree represents an entity with the parent node indicating the group it belongs to. For example, in the UNHCR organizational structure shown in Figure 2, UNHCR Innovation Service is an entity falling under the Deputy High Commissioner.
''',
    '''
### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: During retrieval, we use the entities tree to further augment the context retrieved by the vector database. The entity tree search and context generation occurs as follows. A parser module searches the user query for keywords matching the names of entities in the organization. If one or more matches are found, information about each matched entity is extracted from the tree and converted into a textual statement providing information about the entity and its location within the organization's hierarchy. This information is then combined with the document chunks retrieved from the vector database to form the context. This allows the model to access information about entities and their location within the organization's hierarchy when users ask questions about these entities.
''',
 '''
### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: The overall workflow of our system, Tree-RAG (T-RAG), is shown in Figure 1 and outlined in Algorithm 2. Our system differs from the typical RAG application in the Query process. Instead of using an existing pre-trained LLM, we use a finetuned version of the LLM for answer generation; we finetuned the LLM model on an instruction dataset of questions and answers generated based on the organization's document as described in later sections.
''']

In [61]:
example_batch = tokenizer(examples, return_tensors="pt", padding=True)['input_ids'].to(loaded_model.device)

with torch.cuda.amp.autocast():
    output_tokens = loaded_model.generate(example_batch, max_new_tokens = 2048, pad_token_id=tokenizer.pad_token_id)

outputs = [tokenizer.decode(t, skip_special_tokens=True) for t in output_tokens]
for o in outputs:
    print(o)
    print('#'*100)


### system prompt: Translate the following English text related to Computer Science into Korean. When translating, for Computer Science terms, translate them in the format: Korean translation (English original).
### Input: A feature of T-RAG is the inclusion of an entities tree in addition to the vector database for context retrieval. The entities tree holds information about entities in the organization and their location within the hierarchy. Each node in this tree represents an entity with the parent node indicating the group it belongs to. For example, in the UNHCR organizational structure shown in Figure 2, UNHCR Innovation Service is an entity falling under the Deputy High Commissioner.
### output: T-RAG의 특징은 벡터 데이터베이스 외에 콘텍스트 검색을 위한 엔티티 트리(Entities Tree)를 포함하는 것입니다. 엔티티 트리는 조직 내 엔티티의 위치와 계층 구조를 포함하는 정보를 저장합니다. 이 나무의 각 노드는 부모 노드가 속한 그룹을 나타내는 엔티티를 나타냅니다. 예를 들어, 도 2의 UNHCR 조직 구조에서 UNHCR Innovation Service는 부총재(Deputy High Commissioner) 그룹에 속하는 엔티티입니다.
#############################