In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, AutoConfig
from accelerate import Accelerator, infer_auto_device_map

LLAMA_PATH = 'model/llama_405b_quantized'

# 4bit 퀀타이제이션 설정
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_storage=torch.bfloat16,
        llm_int8_enable_fp32_cpu_offload=True  # CPU 오프로딩 활성화
)
# 각 GPU에 최대 메모리 설정 (예시로 40GB씩 할당)
max_memory = {i: "40GB" for i in range(torch.cuda.device_count())}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 모델의 설정을 먼저 불러오기
config = AutoConfig.from_pretrained(LLAMA_PATH)

In [3]:
from transformers.modeling_utils import init_empty_weights  # ✅ 올바른 import

# 아직 모델을 선언하지 않은 상태에서 `config`를 기반으로 빈 모델 생성
# 🚀 가중치를 전혀 로드하지 않는 완전 빈(empty) 모델 생성
with init_empty_weights():
    empty_model = LlamaForCausalLM(config)
print("빈 모델 선언 완료")


빈 모델 선언 완료


In [4]:
num_gpus = torch.cuda.device_count()

# **🚀 GPU 8대에 균등하게 분배하는 수동 device_map 생성**
device_map = {}

# 임베딩 & 출력 레이어는 GPU 0에 배치
device_map["model.embed_tokens"] = 7
device_map["lm_head"] = 7

# 126개 LlamaDecoderLayer를 8개의 GPU에 균등 분배
for i, layer in enumerate(empty_model.model.layers):
    assigned_gpu = i % num_gpus  # GPU 인덱스 0부터 7까지 순차적으로 할당
    device_map[f"model.layers.{i}"] = assigned_gpu

# 마지막 RMSNorm도 마지막 GPU로 배치
device_map["model.norm"] = 7
device_map["model.rotary_emb"] = 7  # Rotary Embedding도 마지막 GPU로

In [5]:
print(device_map)

{'model.embed_tokens': 7, 'lm_head': 7, 'model.layers.0': 0, 'model.layers.1': 1, 'model.layers.2': 2, 'model.layers.3': 3, 'model.layers.4': 4, 'model.layers.5': 5, 'model.layers.6': 6, 'model.layers.7': 7, 'model.layers.8': 0, 'model.layers.9': 1, 'model.layers.10': 2, 'model.layers.11': 3, 'model.layers.12': 4, 'model.layers.13': 5, 'model.layers.14': 6, 'model.layers.15': 7, 'model.layers.16': 0, 'model.layers.17': 1, 'model.layers.18': 2, 'model.layers.19': 3, 'model.layers.20': 4, 'model.layers.21': 5, 'model.layers.22': 6, 'model.layers.23': 7, 'model.layers.24': 0, 'model.layers.25': 1, 'model.layers.26': 2, 'model.layers.27': 3, 'model.layers.28': 4, 'model.layers.29': 5, 'model.layers.30': 6, 'model.layers.31': 7, 'model.layers.32': 0, 'model.layers.33': 1, 'model.layers.34': 2, 'model.layers.35': 3, 'model.layers.36': 4, 'model.layers.37': 5, 'model.layers.38': 6, 'model.layers.39': 7, 'model.layers.40': 0, 'model.layers.41': 1, 'model.layers.42': 2, 'model.layers.43': 3, 'm

In [6]:
# 모델 로드 (`device_map` 적용)
model = LlamaForCausalLM.from_pretrained(
    LLAMA_PATH,
    quantization_config=bnb_config,
    device_map=device_map,  # 자동 분배된 device_map 적용
    #attn_implementation="flash_attention_2"
)


Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████| 44/44 [01:14<00:00,  1.70s/it]


In [7]:
print(model)
print(model.device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 16384)
    (layers): ModuleList(
      (0-125): 126 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=16384, out_features=16384, bias=False)
          (k_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (v_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (o_proj): Linear4bit(in_features=16384, out_features=16384, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=16384, out_features=53248, bias=False)
          (up_proj): Linear4bit(in_features=16384, out_features=53248, bias=False)
          (down_proj): Linear4bit(in_features=53248, out_features=16384, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((16384,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((16384,), eps=1e-05)
      )
    )
    (norm): Lla

In [8]:
#model.save_pretrained('model/llama_405b_quantized')

In [9]:
from transformers import AutoTokenizer

TOKENIZER_PATH = 'model/llama_405b_quantized'

# Tokenizer 로드
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
tokenizer.pad_token = tokenizer.eos_token
# A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
tokenizer.padding_side = "left"

#tokenizer.save_pretrained('model/llama_405b_quantized')

In [10]:
import re
def instruct_structure(prompt,system_prompt=
                       """You're an expert translator who translates Korean webtoon in English. Make sure the number of target sentences matches the number of source sentences. The result should be TSV formatted. 
    • Find a balance between staying true to the Korean meaning and keeping a natural flow. Don't be afraid to add to the text. Embellish it. 
    • Avoid translating word-for-word. Keep the general feeling and translate the text accordingly. 
    • Translate with an American audience in mind. This means easy-to-read, conversational English."""):
    input_text, output_text = prompt.split('### target')
    input_text = input_text.replace('### glossaries', '### glossary').replace('\n* ', '\n• ')
    input_text = re.sub(r"\[[^\]]+\] ", "none ", input_text)
    return f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    {system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>
    {input_text.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

In [11]:

project_id = "prod-ai-project"

from google.cloud import bigquery
client = bigquery.Client(project=project_id)
sql = """select series_id, episode_id, org_input_text, org_output_text, prompt 
        from webtoon_translation.structured_240820_ep_line
        where data_split = 'romance_valid'"""
df = client.query(sql).result().to_dataframe()
from tqdm import tqdm
tqdm.pandas()
df['prompt'] = df['prompt'].progress_apply(lambda x: instruct_structure(x))

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 74/74 [00:00<00:00, 17682.36it/s]


In [12]:
print(df['prompt'][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You're an expert translator who translates Korean webtoon in English. Make sure the number of target sentences matches the number of source sentences. The result should be TSV formatted. 
    • Find a balance between staying true to the Korean meaning and keeping a natural flow. Don't be afraid to add to the text. Embellish it. 
    • Avoid translating word-for-word. Keep the general feeling and translate the text accordingly. 
    • Translate with an American audience in mind. This means easy-to-read, conversational English.<|eot_id|><|start_header_id|>user<|end_header_id|>
    ### glossary
• 아벨 헤일론 (M): abel heylon
• 아벨 (M): abel
• 피오나 (F): fiona
• 마법: magic / spell

### source
000	none 북부 최전방 헤일론
001	none 여기가 헤일론 공작 성….
002	none 엄청난 위압감이야…
003	none 나는 결국 가족에게 떠밀려 무자비한 공작이 다스린다는 북부 최전방에 왔다.
004	none 딱 봐도 마왕성 같은 게 사람 엄청 굴릴 것 같다.
005	none 내 무덤을 내가 팠지...
006	none 이쪽으로.
007	none 문을 열어라.
008	none 문이, 엄청 커…!!
009	none 북부의 공작, 

In [13]:
sample = df['prompt'][0]

# 입력 변환 및 토큰화
inputs = tokenizer(sample, return_tensors="pt").to("cuda")
print(inputs)

{'input_ids': tensor([[128000, 128000, 128006,   9125, 128007,    198,    262,   1472,   2351,
            459,   6335,  46588,    889,  48018,  16526,   3566,    998,    263,
            304,   6498,     13,   7557,   2771,    279,   1396,    315,   2218,
          23719,   9248,    279,   1396,    315,   2592,  23719,     13,    578,
           1121,   1288,    387,    350,  18282,  24001,     13,    720,    262,
           7436,   7531,    264,   8335,   1990,  19994,    837,    311,    279,
          16526,   7438,    323,  10494,    264,   5933,   6530,     13,   4418,
            956,    387,  16984,    311,    923,    311,    279,   1495,     13,
          30227,    616,    819,    433,     13,    720,    262,   7436,  35106,
          67371,   3492,  15548,  38428,     13,  13969,    279,   4689,   8430,
            323,  15025,    279,   1495,  28178,     13,    720,    262,   7436,
          38840,    449,    459,   3778,  10877,    304,   4059,     13,   1115,
           344

In [14]:

# 모델 추론
with torch.no_grad():
    output = model.generate(**inputs, 
                            max_length=1000,
                            repetition_penalty=1.2
                           )

# 결과 출력
response = tokenizer.decode(output[0], skip_special_tokens=False)
print(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
../aten/src/ATen/native/cuda/TensorCompare.cu:110: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed.


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
