In [None]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig
from accelerate import infer_auto_device_map

LLAMA_PATH = 'model/dpo-240820-ep-line-merged'

# 4-bit quantization settings passed to `from_pretrained`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit
    bnb_4bit_compute_dtype=torch.float16,  # do the computation in float16
    bnb_4bit_use_double_quant=True,  # double quantization (memory optimisation)
    llm_int8_enable_fp32_cpu_offload=True,  # permit CPU offload if the device map ever needs it
)

from accelerate import Accelerator

# The Accelerator object is kept so the `accelerator` name stays defined for
# any other cell that may reference it. It is no longer used to wrap the model:
# `prepare()` is not needed for inference on a model already placed by
# `device_map` (NOTE(review): it may also try an unsupported `.to()` on a
# 4-bit model — confirm against the installed accelerate/transformers versions).
accelerator = Accelerator()

# Upper bound on the memory the loader may use on each of the 8 GPUs (0-7).
max_memory = {gpu_index: "23GB" for gpu_index in range(8)}

# Load the quantized model and let the loader spread the layers over the GPUs.
# The original passed device_map="cpu", which contradicted the intent stated in
# its own comment (automatic GPU distribution), made `max_memory` pointless and
# left the model on the CPU while the next cell moves the inputs to CUDA.
model = LlamaForCausalLM.from_pretrained(
    LLAMA_PATH,
    quantization_config=bnb_config,
    device_map="auto",  # distribute layers across the GPUs automatically
    max_memory=max_memory,  # per-GPU memory cap defined above
)

# Load the tokenizer once the model is in place.
tokenizer = LlamaTokenizer.from_pretrained(LLAMA_PATH)



  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards:  49%|██████████████████████████████████▌                                    | 93/191 [08:31<10:03,  6.15s/it]

In [None]:

# Example conversation in chat format: one system message, one user message.
chat_history = [
    dict(role="system", content="You are a helpful assistant."),
    dict(role="user", content="Hello! How are you?"),
]

# Convert a chat-format history into a single prompt string for the model.
def format_chat_input(chat_history):
    """Render a list of chat turns as a Llama-2 style ``[INST]`` prompt.

    Args:
        chat_history: iterable of dicts with a ``"role"`` key (``"system"``,
            ``"user"`` or ``"assistant"``) and a ``"content"`` string.

    Returns:
        The prompt string. Each user turn becomes ``<s>[INST] ... [/INST]``;
        system text is placed in a ``<<SYS>>`` block at the start of the next
        user turn; assistant replies are appended after the turn they answer
        and closed with ``</s>``. An empty history yields ``""``.

    Turns with any other role are ignored, as before.
    """
    formatted_text = ""
    pending_system = []  # system messages waiting for the next user turn

    for turn in chat_history:
        role = turn["role"]
        content = turn["content"]

        if role == "system":
            # Do not open an [INST] block yet: the system prompt belongs inside
            # the first user turn rather than in an empty turn of its own.
            pending_system.append(content)
        elif role == "user":
            if pending_system:
                system_text = "\n".join(pending_system)
                content = f"<<SYS>>\n{system_text}\n<</SYS>>\n\n{content}"
                pending_system = []
            formatted_text += f"<s>[INST] {content} [/INST]"
        elif role == "assistant":
            # Earlier replies must stay in the prompt so that multi-turn
            # conversations keep their context.
            formatted_text += f" {content} </s>"

    if pending_system:
        # No user turn followed the system message(s): emit them in the same
        # stand-alone form the function has always used, so they are not lost.
        system_text = "\n".join(pending_system)
        formatted_text += f"<s>[INST] <<SYS>>\n{system_text}\n<</SYS>>\n[/INST]"

    return formatted_text

# Build the prompt and tokenize it.
# The prompt produced by format_chat_input already contains its own "<s>"
# markers, so the tokenizer must not prepend a second BOS token.
# The tensors follow the model's device instead of a hard-coded "cuda".
input_text = format_chat_input(chat_history)
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    add_special_tokens=False,
).to(model.device)

# Run inference. `max_new_tokens` limits only the generated continuation,
# whereas `max_length` also counted the prompt tokens.
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=200)

# Decode only the newly generated tokens so the response does not echo the prompt.
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
print(response)


In [10]:
# System message sent ahead of every translation request: a single text part
# holding the translator instructions (TSV output, matching sentence count).
# The indentation inside the triple-quoted string is part of the prompt text.
SYSTEM_PROMPT = dict(
    role="system",
    content=[
        dict(
            type="text",
            text="""You're an expert translator who translates Korean webtoon in English. Make sure the number of target sentences matches the number of source sentences. The result should be TSV formatted. 
            • Find a balance between staying true to the Korean meaning and keeping a natural flow. Don't be afraid to add to the text. Embellish it. 
            • Avoid translating word-for-word. Keep the general feeling and translate the text accordingly. 
            • Translate with an American audience in mind. This means easy-to-read, conversational English.""",
        )
    ],
)

In [11]:
# Send one translation request: the shared system prompt followed by a user
# message that carries a glossary section and ten numbered source lines.
# The indentation inside the triple-quoted string is part of the prompt text.
# NOTE(review): `llama_client` and `LLAMA_FINE_TUNING_MODEL` are not defined in
# the cells visible here — confirm they are created earlier in the notebook.
# NOTE(review): no `response_format` is passed to `parse`; confirm whether the
# plain (non-structured) completion call is what is intended.
chat_completion = llama_client.beta.chat.completions.parse(
    model=LLAMA_FINE_TUNING_MODEL,
    messages=[
        SYSTEM_PROMPT,
        dict(
            role="user",
            content=[
                dict(
                    type="text",
                    text="""### glossary
                            • 유니스 (F): eunice
                            • 시그렌 (M): siegren
                            • 용병: mercenaries
                            • 권터 (M): gunter
                            
                            ### source
                            001	[피오나:F:독백] 남의 남자 탐내지 마세요.
                            002	[피오나:F:독백] 이건 지금 내 거거든.
                            003	[헤더 아델:F:독백] 돌아가겠어요!
                            004	[귀족영애들:nan:독백] 자, 잠깐!
                            005	[귀족영애들:nan:독백] 헤더 영애!
                            006	[귀족영애들:nan:독백] 기다려요!
                            007	[피오나:F:독백] 도망갔다.
                            008	[피오나:F:독백] 이 정도에 꼬리를 말 거면 왜 시비를 거는지 몰라.
                            009	[피오나:F:독백] 그렇지, 시그…
                            010	[피오나:F:독백] …엇.""",
                )
            ],
        ),
    ],
    temperature=0.2,
    top_p=0.8,
    max_tokens=4096,
)

PermissionDeniedError: <!doctype html><meta charset="utf-8"><meta name=viewport content="width=device-width, initial-scale=1"><title>403</title>403 Forbidden

In [31]:
# Print the message text of the first choice in `chat_completion`.
print(chat_completion.choices[0].message.content)

001	don’t you dare covet someone else’s man.
002	he’s mine right now.
003	I’m going to lose my mind!
004	w-wait!
005	lady heather!
006	wait for us!
007	they ran away.
008	why did they even try to pick a fight if they were going to run away so easily?
009	right, sieg--
010	oh...


In [32]:
# Keep the first choice's message text under the name `response`.
# NOTE(review): `response` is also assigned by the local-model generation cell
# earlier in this notebook; running this cell overwrites that value.
response = chat_completion.choices[0].message.content

In [33]:
# Print the text currently stored in `response`.
print(response)

001	don’t you dare covet someone else’s man.
002	he’s mine right now.
003	I’m going to lose my mind!
004	w-wait!
005	lady heather!
006	wait for us!
007	they ran away.
008	why did they even try to pick a fight if they were going to run away so easily?
009	right, sieg--
010	oh...
