In [2]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, AutoConfig
from accelerate import Accelerator, infer_auto_device_map

LLAMA_PATH = 'model/dpo-240820-ep-line-merged'

# 4bit 퀀타이제이션 설정
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_storage=torch.bfloat16,
        llm_int8_enable_fp32_cpu_offload=True  # CPU 오프로딩 활성화
)
# 각 GPU에 최대 메모리 설정 (예시로 40GB씩 할당)
max_memory = {i: "40GB" for i in range(torch.cuda.device_count())}

In [3]:
# 모델의 설정을 먼저 불러오기
config = AutoConfig.from_pretrained(LLAMA_PATH)

In [4]:
from transformers.modeling_utils import init_empty_weights  # ✅ 올바른 import

# 아직 모델을 선언하지 않은 상태에서 `config`를 기반으로 빈 모델 생성
# 🚀 가중치를 전혀 로드하지 않는 완전 빈(empty) 모델 생성
with init_empty_weights():
    empty_model = LlamaForCausalLM(config)
print("빈 모델 선언 완료")


빈 모델 선언 완료


In [5]:
num_gpus = torch.cuda.device_count()

# **🚀 GPU 8대에 균등하게 분배하는 수동 device_map 생성**
device_map = {}

# 임베딩 & 출력 레이어는 GPU 0에 배치
device_map["model.embed_tokens"] = 0
device_map["lm_head"] = 0

# 126개 LlamaDecoderLayer를 8개의 GPU에 균등 분배
for i, layer in enumerate(empty_model.model.layers):
    assigned_gpu = i % num_gpus  # GPU 인덱스 0부터 7까지 순차적으로 할당
    device_map[f"model.layers.{i}"] = assigned_gpu

# 마지막 RMSNorm도 마지막 GPU로 배치
device_map["model.norm"] = 7
device_map["model.rotary_emb"] = 7  # Rotary Embedding도 마지막 GPU로

In [6]:
# 모델 로드 (`device_map` 적용)
model = LlamaForCausalLM.from_pretrained(
    LLAMA_PATH,
    quantization_config=bnb_config,
    device_map=device_map,  # 자동 분배된 device_map 적용
    attn_implementation="flash_attention_2"
)


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████| 191/191 [20:57<00:00,  6.58s/it]


In [7]:
from transformers import AutoTokenizer

TOKENIZER_PATH = 'model/dpo-240820-ep-line-merged'

# Tokenizer 로드
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
tokenizer.pad_token = tokenizer.eos_token
# A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
tokenizer.padding_side = "left"



In [8]:
model.save_pretrained('model/llama_405b_quantized_bfloat16')
tokenizer.save_pretrained('model/llama_405b_quantized_bfloat16')

[2025-03-24 01:25:38,301] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlvsym'
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'
collect2: error: ld returned 1 exit status


('model/llama_405b_quantized_uint8/tokenizer_config.json',
 'model/llama_405b_quantized_uint8/special_tokens_map.json',
 'model/llama_405b_quantized_uint8/tokenizer.json')

In [9]:
from huggingface_hub import login

# Hugging Face 계정에서 받은 API 토큰을 입력
login(token="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")

In [7]:
import re
def instruct_structure(prompt,system_prompt=
                       """You're an expert translator who translates Korean webtoon in English. Make sure the number of target sentences matches the number of source sentences. The result should be TSV formatted. 
    • Find a balance between staying true to the Korean meaning and keeping a natural flow. Don't be afraid to add to the text. Embellish it. 
    • Avoid translating word-for-word. Keep the general feeling and translate the text accordingly. 
    • Translate with an American audience in mind. This means easy-to-read, conversational English."""):
    input_text, output_text = prompt.split('### target')
    input_text = input_text.replace('### glossaries', '### glossary').replace('\n* ', '\n• ')
    input_text = re.sub(r"\[[^\]]+\] ", "[UNK] ", input_text)
    return f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>
{input_text.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

In [8]:
project_id = "prod-ai-project"

from google.cloud import bigquery
client = bigquery.Client(project=project_id)
sql = """select series_id, episode_id, org_input_text, org_output_text, prompt 
        from webtoon_translation.structured_240820_ep_line
        where data_split = 'romance_valid'"""
df = client.query(sql).result().to_dataframe()
from tqdm import tqdm
tqdm.pandas()
df['prompt'] = df['prompt'].progress_apply(lambda x: instruct_structure(x))

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 74/74 [00:00<00:00, 17618.12it/s]


In [9]:
print(df['prompt'][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You're an expert translator who translates Korean webtoon in English. Make sure the number of target sentences matches the number of source sentences. The result should be TSV formatted. 
    • Find a balance between staying true to the Korean meaning and keeping a natural flow. Don't be afraid to add to the text. Embellish it. 
    • Avoid translating word-for-word. Keep the general feeling and translate the text accordingly. 
    • Translate with an American audience in mind. This means easy-to-read, conversational English.<|eot_id|><|start_header_id|>user<|end_header_id|>
### glossary
• 아벨 헤일론 (M): abel heylon
• 아벨 (M): abel
• 피오나 (F): fiona
• 마법: magic / spell

### source
000	[UNK] 북부 최전방 헤일론
001	[UNK] 여기가 헤일론 공작 성….
002	[UNK] 엄청난 위압감이야…
003	[UNK] 나는 결국 가족에게 떠밀려 무자비한 공작이 다스린다는 북부 최전방에 왔다.
004	[UNK] 딱 봐도 마왕성 같은 게 사람 엄청 굴릴 것 같다.
005	[UNK] 내 무덤을 내가 팠지...
006	[UNK] 이쪽으로.
007	[UNK] 문을 열어라.
008	[UNK] 문이, 엄청 커…!!
009	[UNK] 북부의 공작

In [10]:
sample = df['prompt'][0]

# 입력 변환 및 토큰화
inputs = tokenizer(sample, return_tensors="pt").to(model.device)
import time
start_time = time.time()
# 모델 추론
with torch.no_grad():
    output = model.generate(**inputs, 
                            max_new_tokens=1000,#입력 길이와 무관하게 출력길이만 제한
                            do_sample=True,
                            temperature=0.1,
                            top_p=0.9,
                            top_k=30,
                            repetition_penalty=1.2,
                            use_cache=True #kv cache
                           )

# 결과 출력

response = tokenizer.decode(output[0], skip_special_tokens=False)
print(response)
print(f"{time.time()-start_time:.2f}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>
You're an expert translator who translates Korean webtoon in English. Make sure the number of target sentences matches the number of source sentences. The result should be TSV formatted. 
    • Find a balance between staying true to the Korean meaning and keeping a natural flow. Don't be afraid to add to the text. Embellish it. 
    • Avoid translating word-for-word. Keep the general feeling and translate the text accordingly. 
    • Translate with an American audience in mind. This means easy-to-read, conversational English.<|eot_id|><|start_header_id|>user<|end_header_id|>
### glossary
• 아벨 헤일론 (M): abel heylon
• 아벨 (M): abel
• 피오나 (F): fiona
• 마법: magic / spell

### source
000	[UNK] 북부 최전방 헤일론
001	[UNK] 여기가 헤일론 공작 성….
002	[UNK] 엄청난 위압감이야…
003	[UNK] 나는 결국 가족에게 떠밀려 무자비한 공작이 다스린다는 북부 최전방에 왔다.
004	[UNK] 딱 봐도 마왕성 같은 게 사람 엄청 굴릴 것 같다.
005	[UNK] 내 무덤을 내가 팠지...
006	[UNK] 이쪽으로.
007	[UNK] 문을 열어라.
008	[UNK] 문이, 엄청 커…!!