# 1. Setup and Initialization

## 1.1 Importing Necessary Packages

In [1]:
# System
import wandb
import os
import json
import gc

# Environment
from dotenv import load_dotenv
from huggingface_hub import login

# LLM packages
import numpy as np
import torch
from datasets import Dataset, DatasetDict, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
    set_seed,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# Hugging Face cache locations: one directory for datasets, one for model files.
MODEL_CACHE_DIR = "/mnt/t7/.cache/huggingface/models"
DATA_CACHE_DIR = "/mnt/t7/.cache/huggingface/datasets"

# Seed value handed to transformers.set_seed(); the same value is reused as the trainer seed.
train_seed = 2002
set_seed(train_seed)

## 1.2 Logging into Hugging Face Hub and Weights & Biases

In [2]:
# Set the WANDB_PROJECT environment variable, then log in to Weights & Biases.
os.environ["WANDB_PROJECT"]="Graduate Project Llama-3.2-3B"
wandb.login()

# Read variables from the .env file, then log in to the Hugging Face Hub with HF_TOKEN.
# os.environ["HF_TOKEN"] raises KeyError when the variable is not set.
load_dotenv("/mnt/t7/dnn/llm_practicing/.env")
login(token= os.environ["HF_TOKEN"])

[34m[1mwandb[0m: Currently logged in as: [33maeolian83[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# 2. Loading and Preparing the Dataset

In [3]:
# Hub repository id of the dataset used for fine-tuning.
dataset_name = "aeolian83/PTT_wit_Latex_1"

# No split is requested, so every available split is loaded; files are cached under DATA_CACHE_DIR.
dataset_dict = load_dataset(path=dataset_name, cache_dir=DATA_CACHE_DIR)

In [4]:
# Display the loaded DatasetDict (splits, feature names and row counts).
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['term', 'english', 'korean'],
        num_rows: 1432
    })
})

In [5]:
# Shuffle the train split with a fixed seed.
# NOTE(review): dataset_dict is overwritten below, so re-running this cell
# shuffles the already-shuffled split again.
shuffled_train = dataset_dict["train"].shuffle(seed=42)

# Rebuild the DatasetDict around the shuffled split.
# Add other splits here if they exist (e.g. "validation": dataset["validation"]).
dataset_dict = DatasetDict({"train": shuffled_train})

# 3. Hyperparameter Configuration

In [6]:
# ---- Base model ----
model_id = "meta-llama/Llama-3.2-3B"
device_map = {"": 0}

# ---- Quantization (values forwarded to BitsAndBytesConfig) ----
bnb_4bit_quant_type = "nf4"
bnb_4bit_use_double_quant = True
bnb_4bit_compute_dtype = torch.bfloat16
load_in_4bit = True

# ---- Training arguments (values forwarded to TrainingArguments) ----
checkpoint_dir = "/mnt/t7/dnn/paper_translator2/test/checkpoint/meta_Llama-3.2-3B_ft01"
output_dir = checkpoint_dir  # checkpoints are written straight into checkpoint_dir
num_train_epochs = 6
per_device_train_batch_size = 4
gradient_accumulation_steps = 2
learning_rate = 2e-4
lr_scheduler_type = "cosine"
warmup_ratio = 0.03
max_grad_norm = 0.3
optim = "paged_adamw_32bit"
bf16 = True
group_by_length = True
save_strategy = "epoch"
logging_steps = 20
report_to = "wandb"
# eval_steps = 100  (currently unused)

# ---- LoRA (values forwarded to LoraConfig) ----
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1
target_modules = "all-linear"
bias = "none"
task_type = "CAUSAL_LM"

# ---- Tokenizer / completion-only collator ----
padding_side = "right"
response_template = "### Translated:"

# 4. Loading the Base Model for QLoRA

## 4.1 Loading the Model with QLoRA Configuration


In [7]:
# Bundle the quantization settings chosen in the hyperparameter cell into a BitsAndBytesConfig.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    load_in_4bit=load_in_4bit,
)

In [8]:
# Load the base model with quantization_config applied, placed according to device_map.
# NOTE(review): trust_remote_code=True allows repository-supplied code to run;
# confirm it is actually required for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=MODEL_CACHE_DIR,
    device_map=device_map,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## 4.2 Loading the Tokenizer and Setting up Data Collator

In [9]:
# Load the tokenizer that belongs to the base model.
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=MODEL_CACHE_DIR)

# Use the padding side chosen in the hyperparameter cell rather than repeating the literal.
tokenizer.padding_side = padding_side

# Dedicated padding token. The tokenizer resolves pad_token_id from the token
# string, so the id is not hard-coded; fail loudly if the token is missing
# from the vocabulary instead of training with an undefined pad id.
tokenizer.pad_token = "<|finetune_right_pad_id|>"
if tokenizer.pad_token_id is None:
    raise ValueError(
        f"Pad token {tokenizer.pad_token!r} is not in the vocabulary of {model_id!r}"
    )

# Completion-only collator keyed on response_template (the marker that
# formatting_func places in front of the Korean translation).
data_collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# 5. Training the Model
## 5.1 Defining Training Arguments

In [10]:
# Assemble TrainingArguments from the values set in the hyperparameter cell.
training_arguments = TrainingArguments(
    # Output, checkpointing and logging
    output_dir=output_dir,
    save_strategy=save_strategy,
    logging_steps=logging_steps,
    report_to=report_to,
    # eval_steps=eval_steps,  (evaluation is not configured)
    # Schedule and batching
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=group_by_length,
    # Optimisation
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    max_grad_norm=max_grad_norm,
    bf16=bf16,
    # Reproducibility
    seed=train_seed,
)

## 5.2 Defining PEFT Lora Configuration and Formatting Function

In [11]:
# LoRA adapter settings, taken from the hyperparameter cell.
peft_config = LoraConfig(
    task_type=task_type,
    target_modules=target_modules,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=bias,
)

In [12]:
# Formatting function
def formatting_func(example):
    """Render a batch of translation pairs as training strings.

    Args:
        example: Batch mapping with parallel "english" and "korean" sequences.

    Returns:
        A list with one string per "english" entry: the instruction line, the
        English input, the "### Translated:" marker, the Korean text, and the
        tokenizer's EOS token appended at the end.
    """
    return [
        f"Translate input sentence to Korean \n### Input: {source} \n### Translated: {example['korean'][idx]}" + tokenizer.eos_token
        for idx, source in enumerate(example["english"])
    ]

In [13]:
# # 1) system_prompt: 교수님 역할만
# system_prompt = "You are a professor specializing in Physics, proficient in both Korean and English. Your task is to translate English physics content into Korean, adhering to specific guidelines."

# # 2) user_prompt: 가이드라인과 실제 입력을 함께
# user_prompt_template = """<번역 지침>
# 1. 매우 중요: 모든 전문 용어는 반드시 아래 형식으로 번역해야 합니다: 한글 용어(영문 용어). 예시: 적대적 훈련(adversarial training).
# 2. 약어는 다음과 같은 형식을 사용하세요: 한글 전체 용어(영문 전체 용어, 약어). 예시: 계층적으로 조직된 경량 다중 탐지 시스템(hierarchically organized light-weight multiple detector system, HOLMES).
# 3. 학술적인 문체를 유지하며, 기술적 정확성을 확보하세요.
# 4. 원문의 의미를 정확하게 반영하면서도 자연스러운 한국어 문장으로 번역하세요.
# 5. 번역 결과에 '*' 기호를 사용하지 마세요.
# 6. 한국어 문장 내 괄호 안의 모든 영문자는 소문자로 표기하세요.
# 7. 용어 및 괄호 번역 방식을 문장 전체에서 일관되게 유지하세요.
# 8. 수식이나 수학적 표현을 번역할 때는 국내 물리학 논문의 표준 표기법을 따르세요.
# </번역 지침>

# ## 출력 예시
# korean: 앙상블 학습(context of ensemble learning)에서 적응형 신경 프레임워크(adaptive neural frameworks)의 개발은 다양한 벤치마크 데이터셋(benchmark datasets)에서 광범위한 실험 결과로 입증된 바와 같이 심층 신경망(deep neural networks)의 성능을 크게 향상시킵니다. 이러한 적응형 신경 프레임워크(adaptive neural frameworks)를 활용함으로써 연구자들은 특징을 지능적으로 융합하여 더 차별화되고 효과적인 표현을 생성할 수 있으며, 이에 따라 모델의 일반화 능력을 향상시킬 수 있습니다. 결과적으로, 적응형 신경 프레임워크(adaptive neural frameworks)는 전통적인 특징 융합 기법(traditional feature fusion techniques)을 능가할 뿐만 아니라 이미지 분류(image classification), 객체 탐지(object detection), 자연어 처리(natural language processing, NLP), 그래프 기반 학습(graph-based learning) 작업을 포함한 여러 도메인에서 광범위한 적용 가능성을 보여줍니다.

# ## 출력 형식
# korean: 형식에 맞게 용어를 용어번역과 괄호 원문을 배치하고, 잘 번역된 문장들

# 참고: 반드시 한국어 번역문만 출력하세요. 원본 영어 문장은 포함하지 마세요.

# Input: {english}
# """

In [14]:
# # Formatting function
# def formatting_func(example):
#     formatted_texts = []
#     for eng, kor in zip(example["english"], example["korean"]):
#         # SYSTEM 메시지
#         convo = f"[SYSTEM]\n{system_prompt}\n\n"
#         # USER 메시지: guideline + 실제 입력
#         user_prompt = user_prompt_template.format(english=eng)
#         convo   += f"[USER]\n{user_prompt}\n"
#         # ASSISTANT 메시지: 정답 + EOS
#         convo   += f"[ASSISTANT]\nkorean: {kor}{tokenizer.eos_token}"
#         formatted_texts.append(convo)
#     return formatted_texts

## 5.3 Setting up Trainer with SFTTrainer

In [15]:
# Wire the model, LoRA config, dataset, prompt formatter and collator into the SFT trainer.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    peft_config=peft_config,
    tokenizer=tokenizer,
    data_collator=data_collator,
    formatting_func=formatting_func,
    train_dataset=dataset_dict["train"],
)

  trainer = SFTTrainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
# Run fine-tuning, then save the final model to <output_dir>/last_checkpoint.
trainer.train()
trainer.save_model(os.path.join(training_arguments.output_dir, "last_checkpoint"))



Step,Training Loss
20,0.9163
40,0.6347
60,0.5585
80,0.4592
100,0.4555
120,0.4284
140,0.4019
160,0.3757
180,0.3705
200,0.3078


# 6. Model Merge and Upload

In [16]:
# del(trainer)
# gc.collect()
# gc.collect()
# torch.cuda.empty_cache()
# torch.cuda.ipc_collect()

In [17]:
# Adapter saved by trainer.save_model() at the end of training.
checkpoint_dir = "/mnt/t7/dnn/paper_translator2/test/checkpoint/meta_Llama-3.2-3B_ft01/last_checkpoint"

# Base model and device placement used when reloading for the merge.
model_id = "meta-llama/Llama-3.2-3B"
device_map = {"": 0}


In [18]:
# Reload the base model in bfloat16; no quantization_config is passed here.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=MODEL_CACHE_DIR,
    device_map=device_map,
    torch_dtype=torch.bfloat16,
    load_in_8bit=False,
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
# Attach the adapter stored in checkpoint_dir to the reloaded base model.
peft_model = PeftModel.from_pretrained(model=model, model_id=checkpoint_dir)

In [20]:
# Merge the adapter into the base model; `model` now refers to the merged result.
model = peft_model.merge_and_unload()

In [21]:
# Upload the merged model weights to the Hub repository.
model.push_to_hub(repo_id="aeolian83/meta_Llama-3.2-3B_translator01")

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aeolian83/meta_Llama-3.2-3B_translator01/commit/677f5e6b44128dd1508b812b74f6da6d931beb5c', commit_message='Upload LlamaForCausalLM', commit_description='', oid='677f5e6b44128dd1508b812b74f6da6d931beb5c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/aeolian83/meta_Llama-3.2-3B_translator01', endpoint='https://huggingface.co', repo_type='model', repo_id='aeolian83/meta_Llama-3.2-3B_translator01'), pr_revision=None, pr_num=None)

In [22]:
# Upload the tokenizer to the same Hub repository as the model.
tokenizer.push_to_hub(repo_id="aeolian83/meta_Llama-3.2-3B_translator01")

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aeolian83/meta_Llama-3.2-3B_translator01/commit/eb188dc351ad4bcc1d72abf02237a1df6ef1d38e', commit_message='Upload tokenizer', commit_description='', oid='eb188dc351ad4bcc1d72abf02237a1df6ef1d38e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/aeolian83/meta_Llama-3.2-3B_translator01', endpoint='https://huggingface.co', repo_type='model', repo_id='aeolian83/meta_Llama-3.2-3B_translator01'), pr_revision=None, pr_num=None)