In [2]:
import torch
import copy
from transformers import AutoTokenizer

# LLaVA 프로젝트 내부 모듈 Import
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, IGNORE_INDEX
from llava.conversation import conv_templates, SeparatorStyle
from llava.mm_utils import tokenizer_image_token

# -------------------------------------------------------------------------
# 1. 샘플 데이터 (Instruction Tuning Data)
# -------------------------------------------------------------------------
# One multi-turn instruction-tuning record: four human/gpt rounds about a
# single image. Only the very first human turn carries the "<image>" placeholder.
sample_data = {
    "id": "000000334872",
    "image": "coco/train2017/000000334872.jpg",
    "conversations": [
        {"from": speaker, "value": text}
        for speaker, text in (
            ("human", "<image>\nAre the people in the image skiing downhill or cross-country skiing?"),
            ("gpt", "The people in the image are cross-country skiing in the woods, as they are skiing on a trail rather than a steep slope."),
            ("human", "How many people are in the image?"),
            ("gpt", "There are two people in the image, both on skis in the snow."),
            ("human", "What kind of environment are they skiing in?"),
            ("gpt", "They are skiing in a wooded environment, following a trail through the trees while surrounded by snow."),
            ("human", "Do the skiers have any additional gear with them besides their skis and poles?"),
            ("gpt", "Yes, the two male skiers are carrying backpacks while they ski through the woods. The backpacks might contain essentials for their skiing adventure, such as food, water, extra clothing, or safety equipment."),
        )
    ],
}


In [5]:

# ---------------------------------------------------------------------
# 2. Model / tokenizer path configuration
# ---------------------------------------------------------------------
# Use the fine-tuned checkpoint directory.
model_path = "checkpoints/llava-v1.5-7b-50pct-finetune"

# One set of options shared by the checkpoint tokenizer and the fallback, so
# both code paths produce an identically configured tokenizer.
tokenizer_kwargs = dict(
    model_max_length=2048,
    padding_side="right",
    use_fast=False,
)

print(f"Loading tokenizer from: {model_path}")

try:
    tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_kwargs)
except OSError:
    # The checkpoint directory has no tokenizer files: fall back to the base
    # Vicuna tokenizer, configured exactly like the primary one.
    print("Warning: 체크포인트에 토크나이저 파일이 없어서 기본 Vicuna 토크나이저를 로드합니다.")
    tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5", **tokenizer_kwargs)

# NOTE: DEFAULT_IMAGE_TOKEN ("<image>") is deliberately NOT added to the
# vocabulary. tokenizer_image_token() substitutes IMAGE_TOKEN_INDEX for the
# placeholder itself, so a vocabulary entry is never used; adding one would only
# make this tokenizer's vocabulary differ from the one used during training.


# ---------------------------------------------------------------------
# [Step 1] Right before tokenization: prompt construction
# ---------------------------------------------------------------------
# LLaVA-1.5 uses the 'vicuna_v1' conversation template.
conv = conv_templates["vicuna_v1"].copy()
human_role, assistant_role = conv.roles[0], conv.roles[1]
roles = {"human": human_role, "gpt": assistant_role}

# Turns of the sample conversation.
source = sample_data["conversations"]

# Drop the first turn when it is not a human turn (same rule as LLaVA's train.py).
first_speaker = roles[source[0]["from"]]
if first_speaker != human_role:
    source = source[1:]

# Start from an empty history and replay every turn under its template role.
conv.messages = []
for turn in source:
    conv.append_message(roles[turn["from"]], turn["value"])

# Render the final prompt string.
full_prompt = conv.get_prompt()

heavy_rule = "=" * 80
print("\n" + heavy_rule)
print(" [Step 1] 토크나이즈 직전 (Human-Readable Prompt)")
print(heavy_rule)
print(full_prompt)
print(heavy_rule)


# ---------------------------------------------------------------------
# [Step 2] Right before embedding: input IDs & labels (masking applied)
# ---------------------------------------------------------------------
# Simplified re-implementation of preprocess_v1() from LLaVA's train.py.

# 1. Build the input IDs.
#    (the "<image>" placeholder text becomes IMAGE_TOKEN_INDEX (-200))
input_ids = tokenizer_image_token(full_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')

# 2. Labels start as a copy of the input IDs; the user parts are masked below.
targets = input_ids.clone()

# 3. Masking: every position that belongs to an instruction becomes IGNORE_INDEX.
sep = conv.sep + conv.roles[1] + ": "  # " ASSISTANT: "

# Split the conversation into rounds (the Vicuna template ends each round with </s>).
rounds = full_prompt.split(conv.sep2)

cur_len = 1  # the leading <s> (BOS) token
targets[:cur_len] = IGNORE_INDEX  # never learn to predict BOS

for i, rou in enumerate(rounds):
    if rou == "":
        break

    parts = rou.split(sep)
    if len(parts) != 2:
        break

    # parts[0]: "USER: <image>\nQuestion"
    # parts[1]: "Answer"

    # Restore the separator: everything up to and including "ASSISTANT: " is
    # the instruction.
    parts[0] += sep

    # Measure the instruction length by tokenizing it on its own.
    instruction_ids = tokenizer_image_token(parts[0], tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')

    # Subtract 2, not 1: one for the <s> that tokenization prepends, and one for
    # the trailing space of "ASSISTANT: ", which is tokenized as a standalone
    # token here but merges into the first answer token inside the full prompt.
    # Subtracting only 1 would also mask the first token of every answer
    # (upstream preprocess_v1 uses -2 for the same reason).
    instruction_len = len(instruction_ids) - 2

    # Core masking step: overwrite the instruction span of this round.
    targets[cur_len : cur_len + instruction_len] = IGNORE_INDEX

    # Advance to the start of the next round. The standalone tokenization counts
    # a <s> that is absent in the full sequence, and the full sequence has a
    # </s> that `rou` lacks; the two cancel, so the plain length is the stride.
    round_ids = tokenizer_image_token(rou, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
    cur_len += len(round_ids)

# Anything after the last fully processed round is not supervised.
targets[cur_len:] = IGNORE_INDEX

# Sanity check: the per-round lengths must add up to the full sequence length,
# otherwise the mask above is misaligned (e.g. a tokenizer that does not
# tokenize a round identically in isolation and in context).
if cur_len != len(input_ids):
    print(f"WARNING: tokenization mismatch: {cur_len} vs. {len(input_ids)}. Label mask may be misaligned.")

# Report the tensors produced above and decode the supervised part of the labels.
heavy_rule = "=" * 80
light_rule = "-" * 40

print("\n" + heavy_rule)
print(" [Step 2] 임베딩 레이어 진입 직전 (Input IDs & Labels)")
print(heavy_rule)

print(f"Total Sequence Length: {len(input_ids)}")
print(light_rule)

# Show only the first 60 tokens to keep the output readable.
head_input_ids = input_ids[:60].tolist()
print(f"1. Input IDs (First 60 tokens):\n{head_input_ids}")

# Is the image placeholder index (-200) present anywhere in the sequence?
has_image = (input_ids == IMAGE_TOKEN_INDEX).any().item()
print(f"\n   -> 이미지 토큰(-200) 존재 여부: {has_image}")
if has_image:
    print("   (참고: 모델 내부에서 이 -200 자리가 576개의 Visual Embedding으로 교체됩니다.)")

print(light_rule)
head_labels = targets[:60].tolist()
print(f"2. Labels (First 60 tokens) [-100은 Loss 계산 제외]:\n{head_labels}")

# ---------------------------------------------------------------------
# Verification: reconstruct what the model is actually trained to predict
# ---------------------------------------------------------------------
print(light_rule)
print("[검증] Labels에서 -100을 제외하고 복원한 텍스트 (모델이 맞춰야 할 정답):")

# Keep only the supervised positions and turn them back into text.
valid_tokens = targets[targets.ne(IGNORE_INDEX)]
decoded_truth = tokenizer.decode(valid_tokens, skip_special_tokens=False)

print("\n" + decoded_truth)
print(light_rule)
print("-> 위 텍스트에 '질문'은 없고 '답변'과 '</s>'만 보이면 정상입니다.")
print(heavy_rule)


Loading tokenizer from: checkpoints/llava-v1.5-7b-50pct-finetune

 [Step 1] 토크나이즈 직전 (Human-Readable Prompt)
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
Are the people in the image skiing downhill or cross-country skiing? ASSISTANT: The people in the image are cross-country skiing in the woods, as they are skiing on a trail rather than a steep slope.</s>USER: How many people are in the image? ASSISTANT: There are two people in the image, both on skis in the snow.</s>USER: What kind of environment are they skiing in? ASSISTANT: They are skiing in a wooded environment, following a trail through the trees while surrounded by snow.</s>USER: Do the skiers have any additional gear with them besides their skis and poles? ASSISTANT: Yes, the two male skiers are carrying backpacks while they ski through the woods. The backpacks might contain essentials for their skiing ad

In [7]:
# Display the prompt string on its own; relies on `full_prompt` having been
# built by the prompt-construction cell earlier in the notebook.
print(full_prompt)

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
Are the people in the image skiing downhill or cross-country skiing? ASSISTANT: The people in the image are cross-country skiing in the woods, as they are skiing on a trail rather than a steep slope.</s>USER: How many people are in the image? ASSISTANT: There are two people in the image, both on skis in the snow.</s>USER: What kind of environment are they skiing in? ASSISTANT: They are skiing in a wooded environment, following a trail through the trees while surrounded by snow.</s>USER: Do the skiers have any additional gear with them besides their skis and poles? ASSISTANT: Yes, the two male skiers are carrying backpacks while they ski through the woods. The backpacks might contain essentials for their skiing adventure, such as food, water, extra clothing, or safety equipment.</s>


In [None]:
#!/usr/bin/env python3
"""
LLaVA 전처리 과정 확인 스크립트
1. 토크나이즈 직전 텍스트 형태
2. 임베딩 레이어 직전 input_ids 형태
"""

import sys
import torch
from pathlib import Path
from PIL import Image
import json

# LLaVA 경로 추가
sys.path.insert(0, '/nas/home/ongv1109/LLaVA1.5')

from llava.model import LlavaLlamaForCausalLM
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates
from llava.mm_utils import tokenizer_image_token, process_images
from transformers import AutoTokenizer

# ================== Configuration ==================
MODEL_PATH = "/nas/home/ongv1109/LLaVA1.5/checkpoints/llava-v1.5-7b-50pct-finetune"
IMAGE_FOLDER = "/nas/datahub/llava-v1.5-instruct"

# Sample record: four question/answer rounds about one image, expanded into
# alternating "human" / "gpt" turns. Only the first question carries "<image>".
SAMPLE = {
    "id": "000000334872",
    "image": "coco/train2017/000000334872.jpg",
    "conversations": [
        {"from": speaker, "value": text}
        for question, answer in (
            (
                "<image>\nAre the people in the image skiing downhill or cross-country skiing?",
                "The people in the image are cross-country skiing in the woods, as they are skiing on a trail rather than a steep slope.",
            ),
            (
                "How many people are in the image?",
                "There are two people in the image, both on skis in the snow.",
            ),
            (
                "What kind of environment are they skiing in?",
                "They are skiing in a wooded environment, following a trail through the trees while surrounded by snow.",
            ),
            (
                "Do the skiers have any additional gear with them besides their skis and poles?",
                "Yes, the two male skiers are carrying backpacks while they ski through the woods. The backpacks might contain essentials for their skiing adventure, such as food, water, extra clothing, or safety equipment.",
            ),
        )
        for speaker, text in (("human", question), ("gpt", answer))
    ],
}

def print_section(title):
    """Print `title` framed by two 80-character '=' rules.

    A blank line is emitted before the first rule and after the second one.
    """
    rule = "=" * 80
    print(f"\n{rule}\n {title}\n{rule}\n")

def load_model_and_tokenizer():
    """Load the LLaVA model and its tokenizer from MODEL_PATH.

    Returns:
        tuple: ``(model, tokenizer)``. The model is loaded with float16
        weights and ``device_map="cpu"``; the tokenizer is the slow
        (non-fast) variant, right-padded, with a 2048-token maximum length.
    """
    print("Loading model and tokenizer...")

    # Tokenizer options, collected in one place.
    tokenizer_options = {
        "model_max_length": 2048,
        "padding_side": "right",
        "use_fast": False,
    }
    loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, **tokenizer_options)

    # Model: half-precision weights, placed on the CPU.
    loaded_model = LlavaLlamaForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.float16,
        device_map="cpu",
    )

    return loaded_model, loaded_tokenizer

def preprocess_multimodal(conversations):
    """Move the image placeholder to the front of every turn that contains it.

    For each turn whose ``'value'`` contains DEFAULT_IMAGE_TOKEN, all
    occurrences of the token are removed, the remaining text is stripped, and
    the result is rewritten as ``DEFAULT_IMAGE_TOKEN + '\\n' + text`` (stripped
    again, so a turn consisting only of the token becomes just the token).
    Turns without the token are passed through unchanged.

    Args:
        conversations: iterable of turn mappings, each with at least a
            ``'value'`` string entry.

    Returns:
        list: a new list of new dicts. The input list and the dicts inside it
        are left untouched, so callers do not need to deep-copy their data
        before calling this function.
    """
    normalized = []
    for sentence in conversations:
        # Copy the turn so the caller's dict is never modified in place.
        turn = dict(sentence)
        text = turn['value']
        if DEFAULT_IMAGE_TOKEN in text:
            text = text.replace(DEFAULT_IMAGE_TOKEN, '').strip()
            turn['value'] = (DEFAULT_IMAGE_TOKEN + '\n' + text).strip()
        normalized.append(turn)
    return normalized

def preprocess_llama2(conversations, tokenizer):
    """Render the conversation as one prompt string using the 'vicuna_v1' template.

    Despite its name, this function does not produce LLaMA-2 ``[INST]``
    formatting: it fills the ``vicuna_v1`` conversation template, mapping the
    ``"human"`` / ``"gpt"`` speakers to the template's first / second role.

    Args:
        conversations: iterable of turn mappings with ``"from"``
            (``"human"`` or ``"gpt"``) and ``"value"`` (text) entries.
        tokenizer: unused; kept so existing call sites keep working.

    Returns:
        str: the prompt returned by the template's ``get_prompt()``.

    Raises:
        KeyError: if a turn's ``"from"`` is neither ``"human"`` nor ``"gpt"``.
    """
    conv = conv_templates["vicuna_v1"].copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

    # Work on a private list so slicing below never affects the caller.
    turns = list(conversations)

    # Drop a leading non-human turn, the same rule the notebook's
    # prompt-construction cell applies before building its prompt.
    if turns and roles[turns[0]["from"]] != conv.roles[0]:
        turns = turns[1:]

    # Start from an empty history, then replay each turn under its role.
    conv.messages = []
    for sentence in turns:
        conv.append_message(roles[sentence["from"]], sentence["value"])

    return conv.get_prompt()

def main():
    """Walk through LLaVA preprocessing for SAMPLE, printing every stage.

    Stages, in order:
      1. the raw conversation as JSON,
      2. the conversation after preprocess_multimodal(),
      3. the prompt text right before tokenization,
      4. the input_ids produced by tokenizer_image_token(),
      5. an ID -> text table for the first 20 tokens,
      6. image preprocessing (skipped with a warning if the file is missing),
      7. a short summary.

    SAMPLE itself is not modified: stage 2 operates on copies of its turns.
    """
    print_section("LLaVA 전처리 과정 확인")

    # Load the model and tokenizer.
    model, tokenizer = load_model_and_tokenizer()

    # ================== STEP 1: raw conversation ==================
    print_section("STEP 1: 원본 대화 (JSON)")
    print(json.dumps(SAMPLE['conversations'], indent=2, ensure_ascii=False))

    # ================== STEP 2: multimodal preprocessing ==================
    print_section("STEP 2: 멀티모달 전처리 후")
    # Copy every turn dict, not just the list: a shallow list copy would still
    # share the dicts, so in-place edits would leak back into SAMPLE.
    conversations = preprocess_multimodal([dict(turn) for turn in SAMPLE['conversations']])

    for i, turn in enumerate(conversations):
        print(f"[{i}] {turn['from']}: {repr(turn['value'][:100])}...")

    # ================== STEP 3: apply the template (right before tokenization) ==================
    print_section("STEP 3: 토크나이즈 직전 텍스트")

    prompt = preprocess_llama2(conversations, tokenizer)

    print("=" * 80)
    print("전체 프롬프트:")
    print("=" * 80)
    print(prompt)
    print("=" * 80)
    print(f"길이: {len(prompt)} 문자")
    print(f"<image> 토큰 위치: {prompt.find(DEFAULT_IMAGE_TOKEN)}")

    # ================== STEP 4: after tokenization (right before embedding) ==================
    print_section("STEP 4: 토크나이즈 후 input_ids (임베딩 레이어 직전)")

    # Tokenize; the image placeholder becomes IMAGE_TOKEN_INDEX.
    input_ids = tokenizer_image_token(
        prompt,
        tokenizer,
        IMAGE_TOKEN_INDEX,
        return_tensors='pt'
    )

    print(f"input_ids shape: {input_ids.shape}")
    print(f"input_ids dtype: {input_ids.dtype}")
    print(f"\ninput_ids (처음 50개):")
    print(input_ids[:50])
    print(f"\ninput_ids (전체):")
    print(input_ids)

    # Locate every IMAGE_TOKEN_INDEX position.
    image_token_positions = (input_ids == IMAGE_TOKEN_INDEX).nonzero(as_tuple=True)[0]
    print(f"\nIMAGE_TOKEN_INDEX ({IMAGE_TOKEN_INDEX}) 위치:")
    print(image_token_positions)

    # ================== STEP 5: token ID -> text mapping ==================
    print_section("STEP 5: 토큰 ID와 텍스트 매핑 (샘플)")

    print("처음 20개 토큰:")
    for i in range(min(20, len(input_ids))):
        token_id = input_ids[i].item()
        if token_id == IMAGE_TOKEN_INDEX:
            # Not a vocabulary ID, so it cannot be decoded.
            token_text = "<IMAGE_TOKEN>"
        else:
            token_text = tokenizer.decode([token_id])
        print(f"  [{i:3d}] ID={token_id:6d} → {repr(token_text)}")

    # ================== STEP 6: image preprocessing ==================
    print_section("STEP 6: 이미지 전처리")

    # Stays None when the image is missing; the summary checks this instead of
    # touching the filesystem a second time.
    image_tensor = None

    image_path = Path(IMAGE_FOLDER) / SAMPLE['image']
    if image_path.exists():
        print(f"이미지 경로: {image_path}")

        # Load the image; the context manager closes the file handle, and
        # convert() returns an independent in-memory copy.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        print(f"원본 이미지 크기: {image.size}")

        # Fetch the vision tower's image processor. NOTE(review): in the LLaVA
        # code base the tower is created lazily when the model is built via
        # from_pretrained(), and `image_processor` only exists after
        # load_model() has run (llava.model.builder does the same check).
        vision_tower = model.get_vision_tower()
        if not vision_tower.is_loaded:
            vision_tower.load_model()
        image_processor = vision_tower.image_processor

        # CLIP preprocessing through LLaVA's process_images().
        image_tensor = process_images([image], image_processor, model.config)

        print(f"전처리 후 이미지 텐서 shape: {image_tensor.shape}")
        print(f"전처리 후 이미지 텐서 dtype: {image_tensor.dtype}")
        print(f"전처리 후 이미지 텐서 범위: [{image_tensor.min():.3f}, {image_tensor.max():.3f}]")
    else:
        print(f"⚠️  이미지를 찾을 수 없습니다: {image_path}")

    # ================== STEP 7: summary ==================
    print_section("요약")

    print(f"✓ 원본 대화 턴 수: {len(SAMPLE['conversations'])}")
    print(f"✓ 토크나이즈 전 텍스트 길이: {len(prompt)} 문자")
    print(f"✓ 토크나이즈 후 시퀀스 길이: {len(input_ids)} 토큰")
    print(f"✓ IMAGE_TOKEN 개수: {len(image_token_positions)}")
    print(f"✓ IMAGE_TOKEN_INDEX 값: {IMAGE_TOKEN_INDEX}")

    if image_tensor is not None:
        print(f"✓ 이미지 전처리 완료: {image_tensor.shape}")

    print("\n" + "="*80)
    print("전처리 과정 확인 완료!")
    print("="*80)

# Run the walkthrough only when this code is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()