### 라이브러리 설치

In [None]:
!pip install datasets
!pip install transformers
!pip install transformers[torch]
!pip install accelerate -U
!pip install transformers[torch] --upgrade
!pip install accelerate --upgrade
!pip install evaluate
!pip install rouge_score

### 데이터 경로 설정

In [15]:
# Caption annotation files (JSON), one per split. Despite the `_dir` suffix
# these names hold file paths, not directories.
train_caption_dir = "train_renew.json"
val_caption_dir = "val_renew.json"
test_caption_dir = "test_renew.json"

# Folder that contains the image files.
image_dir = "./test10img"

### 이미지 개수 확인

In [12]:
from PIL import Image
import os

# Collect the names of all regular files (not sub-directories) in image_dir.
image_files = [f for f in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, f))]


# Count the files whose extension marks them as an image. The comparison is
# done on the lower-cased name so that e.g. "photo.JPG" is counted as well.
image_extensions = ['.jpg', '.jpeg', '.png']
image_count = sum(1 for f in image_files if f.lower().endswith(tuple(image_extensions)))


print(f"폴더 '{image_dir}' 내에 {image_count} 개의 이미지 파일이 있습니다.")

폴더 './test10img' 내에 2551 개의 이미지 파일이 있습니다.


### 라이브러리 Import

In [3]:
import os
import datasets
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor,AutoTokenizer
# Disable Weights & Biases logging for this session.
os.environ["WANDB_DISABLED"] = "true"
import nltk

# Make sure the sentence-tokenizer data used by nltk.sent_tokenize is present.
# Older NLTK releases read "punkt"; newer ones read "punkt_tab", so both are
# checked. With quiet=True a resource unknown to the installed NLTK version is
# skipped without raising.
for _nltk_resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_nltk_resource}")
    except (LookupError, OSError):
        nltk.download(_nltk_resource, quiet=True)

### VisionEncoderDecoderModel 초기화

In [4]:
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoFeatureExtractor

# Checkpoints used for the two halves of the captioning model.
image_encoder_model = "google/vit-base-patch16-224-in21k"
text_decode_model = "gpt2"

# Combine the pretrained image encoder and text decoder into one
# VisionEncoderDecoderModel.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path=image_encoder_model,
    decoder_pretrained_model_name_or_path=text_decode_model,
)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.1.crossattention.q_attn.bias', 'h.1.ln_cross_attn.weight', 'h.3.crossattention.c_attn.bias', 'h.8.crossattention.q_attn.bias', 'h.10.crossattention.q_attn.bias', 'h.3.ln_cross_attn.bias', 'h.3.crossattention.q_attn.bias', 'h.4.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.6.ln_cross_attn.bias', 'h.4.ln_cross_attn.bias', 'h.10.crossattention.c_proj.weight', 'h.9.crossattention.q_attn.weight', 'h.6.crossattention.c_attn.weight', 'h.6.ln_cross_attn.weight', 'h.2.ln_cross_attn.bias', 'h.1.ln_cross_attn.bias', 'h.4.crossattention.c_attn.weight', 'h.8.ln_cross_attn.weight', 'h.8.ln_cross_attn.bias', 'h.5.crossattention.q_attn.bias', 'h.1.crossattention.c_proj.bias', 'h.4.crossattention.c_proj.weight', 'h.4.ln_cross_attn.weight', 'h.7.crossattention.c_attn.bias', 'h.11.crossattention.c_proj.bias', 'h.8.crossattention.c_proj.weight', 'h.11.crossattention.c

In [5]:
# Text tokenizer matching the decoder checkpoint.
tokenizer = AutoTokenizer.from_pretrained(text_decode_model)

# Image feature extractor matching the encoder checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(image_encoder_model)



In [6]:
# GPT-2 only defines bos/eos tokens; it has no dedicated pad or
# decoder-start token. Reuse the eos token as the pad token.
tokenizer.pad_token = tokenizer.eos_token

# Copy the special-token ids from the tokenizer into the model config.
# (pad_token_id must be read after pad_token has been assigned above.)
model.config.pad_token_id = tokenizer.pad_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id

In [7]:
# Persist the freshly assembled (not yet fine-tuned) model together with its
# feature extractor and tokenizer.
output_dir = "vit-gpt-model"
for component in (model, feature_extractor):
    component.save_pretrained(output_dir)
# Kept as the final expression so the list of written tokenizer files is shown.
tokenizer.save_pretrained(output_dir)

('vit-gpt-model\\tokenizer_config.json',
 'vit-gpt-model\\special_tokens_map.json',
 'vit-gpt-model\\vocab.json',
 'vit-gpt-model\\merges.txt',
 'vit-gpt-model\\added_tokens.json',
 'vit-gpt-model\\tokenizer.json')

### Dataset 불러오기

In [16]:
import datasets

# Map each split name to the path of its JSON caption file.
data_files = dict(
    train=train_caption_dir,
    test=test_caption_dir,
    validation=val_caption_dir,
)

# Load the splits; the records are read from the 'images' field of each JSON file.
ds = datasets.load_dataset("json", field='images', data_files=data_files)

# Show the resulting dataset structure.
print(ds)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['width', 'image_id', 'coco_url', 'caption', 'file_name', 'caption_id', 'image_path', 'height'],
        num_rows: 1785
    })
    test: Dataset({
        features: ['width', 'image_id', 'coco_url', 'caption', 'file_name', 'caption_id', 'image_path', 'height'],
        num_rows: 384
    })
    validation: Dataset({
        features: ['width', 'image_id', 'coco_url', 'caption', 'file_name', 'caption_id', 'image_path', 'height'],
        num_rows: 382
    })
})


In [17]:
# Inspect the first example of the training split.
ds['train'][0]

{'width': 600,
 'image_id': 2,
 'coco_url': 'https://example.com/images/glass2.jpg',
 'caption': 'glass.',
 'file_name': 'glass2.jpg',
 'caption_id': 2,
 'image_path': 'C:/Ye_Dong/AI_Team_project_final/Data/test10/test10img/glass2.jpg',
 'height': 800}

### 전처리(이미지, caption)

In [18]:
from PIL import Image

# Text preprocessing step
def tokenization_fn(captions, max_target_length):
    """Tokenize captions into fixed-length label id sequences.

    Args:
        captions: The caption strings to tokenize.
        max_target_length: Length every returned sequence is padded or
            truncated to.

    Returns:
        The tokenizer's `input_ids`, one sequence of exactly
        `max_target_length` ids per caption.
    """
    # truncation=True guarantees that captions longer than max_target_length
    # are cut instead of yielding rows of differing length, which could not be
    # stacked into a batch later on.
    labels = tokenizer(captions,
                      padding="max_length",
                      truncation=True,
                      max_length=max_target_length).input_ids

    return labels

# Image preprocessing step
def _load_rgb_images(image_paths, check_image=True):
    """Open every path as an RGB image and report which paths could be loaded.

    Returns a pair `(images, to_keep)`: `images` holds the successfully loaded
    images in input order, and `to_keep` holds one boolean per input path
    (True when that path was loaded).
    """
    images = []
    to_keep = []
    for image_file in image_paths:
        try:
            # Image.open is lazy; convert() forces the pixel data to be decoded
            # and returns an independent RGB copy, so the file handle can be
            # closed right away and non-RGB inputs get three channels.
            with Image.open(image_file) as img:
                images.append(img.convert("RGB"))
            to_keep.append(True)
        except Exception:
            if not check_image:
                raise
            to_keep.append(False)
    return images, to_keep


def feature_extraction_fn(image_paths, check_image=True):
    """Run the feature extractor on the images at `image_paths`.

    If `check_image` is True, images that cannot be opened or decoded are
    skipped, so the result may contain fewer rows than `image_paths`;
    otherwise the underlying exception propagates.

    Returns:
        The `pixel_values` array produced by the feature extractor.
    """
    images, _ = _load_rgb_images(image_paths, check_image=check_image)

    encoder_inputs = feature_extractor(images=images, return_tensors="np")

    return encoder_inputs.pixel_values

def preprocess_fn(examples, max_target_length, check_image=True):
    """Tokenize the captions and extract image features for a batch.

    Args:
        examples: Batch mapping with the columns 'image_path' and 'caption'.
        max_target_length: Target length forwarded to `tokenization_fn`.
        check_image: When True, examples whose image cannot be loaded are
            dropped from the batch (image and caption together); when False
            the loading error is raised.

    Returns:
        A dict with 'labels' and 'pixel_values', one row per kept example.
    """
    image_paths = examples['image_path']
    captions = examples['caption']

    images, to_keep = _load_rgb_images(image_paths, check_image=check_image)
    # Drop the captions of unreadable images so that labels and pixel values
    # stay aligned row by row.
    captions = [caption for caption, keep in zip(captions, to_keep) if keep]

    model_inputs = {}
    model_inputs['labels'] = tokenization_fn(captions, max_target_length)
    model_inputs['pixel_values'] = feature_extractor(images=images, return_tensors="np").pixel_values

    return model_inputs


In [19]:
# Run preprocess_fn over every split in batches. The raw columns are dropped,
# leaving only what preprocess_fn returns.
raw_columns = ds['train'].column_names
processed_dataset = ds.map(
    preprocess_fn,
    batched=True,
    remove_columns=raw_columns,
    fn_kwargs=dict(max_target_length=500),
)

Map:   0%|          | 0/1785 [00:00<?, ? examples/s]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

Map:   0%|          | 0/382 [00:00<?, ? examples/s]

In [20]:
# Display the processed dataset (splits, columns and row counts).
processed_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'pixel_values'],
        num_rows: 1785
    })
    test: Dataset({
        features: ['labels', 'pixel_values'],
        num_rows: 384
    })
    validation: Dataset({
        features: ['labels', 'pixel_values'],
        num_rows: 382
    })
})

### seq2seq train 인수 정의

In [21]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training configuration: generate text during evaluation, evaluate once per
# epoch, and use a batch size of 4 per device for both training and evaluation.
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    output_dir="./image-captioning-output",
    # Turn off logging integrations explicitly instead of relying on the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


### 학습 요건 설정 및 학습

In [22]:
import evaluate
# Load the ROUGE metric; compute_metrics calls metric.compute(...) on the decoded texts.
metric = evaluate.load("rouge")

In [23]:
import numpy as np

# When True, compute_metrics replaces -100 entries in the labels with the
# tokenizer's pad token id before decoding them.
ignore_pad_token_for_loss = True

def postprocess_text(preds, labels):
    """Normalize decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace and re-joined with one
    sentence per line, which is the layout rougeLsum expects.

    Returns:
        A pair `(preds, labels)` of lists of normalized strings.
    """
    def _strip_all(texts):
        return [text.strip() for text in texts]

    def _one_sentence_per_line(texts):
        return ["\n".join(nltk.sent_tokenize(text)) for text in texts]

    stripped_preds = _strip_all(preds)
    stripped_labels = _strip_all(labels)

    return _one_sentence_per_line(stripped_preds), _one_sentence_per_line(stripped_labels)

def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_preds: A pair `(preds, labels)` of token-id arrays; `preds` may
            also be a tuple whose first element holds the token ids.

    Returns:
        A dict with the ROUGE scores as percentages rounded to four decimals,
        plus "gen_len", the mean number of non-pad tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 is a padding/ignore marker, not a vocabulary id, and cannot be
    # decoded. Predictions can contain it as well (e.g. when batches of
    # different length are concatenated), so map it to the pad token first.
    preds = np.where(preds != -100, preds, pad_id)

    if ignore_pad_token_for_loss:
        # Same replacement for the reference labels.
        labels = np.where(labels != -100, labels, pad_id)

    # Turn token ids back into readable text.
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Light post-processing (strip, one sentence per line).
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds,
                            references=decoded_labels,
                            use_stemmer=True)

    # Express the scores as percentages rounded to four decimals.
    result = {k: round(v * 100, 4) for k, v in result.items()}

    # Mean number of non-pad tokens per generated sequence.
    prediction_lens = [
        np.count_nonzero(pred != pad_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)

    return result


In [24]:
from transformers import default_data_collator

# Build the trainer from the model, the training arguments, the processed
# train/validation splits and the metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset['train'],
    eval_dataset=processed_dataset['validation'],
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    # NOTE(review): the image feature extractor is passed in the `tokenizer`
    # slot, presumably so the Trainer stores it with the saved model (confirm).
    tokenizer=feature_extractor,
)

In [25]:
# Start training; the returned TrainOutput is displayed as the cell result.
trainer.train()

  0%|          | 0/1341 [00:00<?, ?it/s]



  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.0025906322989612818, 'eval_rouge1': 84.3794, 'eval_rouge2': 0.3348, 'eval_rougeL': 84.4917, 'eval_rougeLsum': 84.5201, 'eval_gen_len': 2.424083769633508, 'eval_runtime': 474.8085, 'eval_samples_per_second': 0.805, 'eval_steps_per_second': 0.202, 'epoch': 1.0}
{'loss': 0.0356, 'learning_rate': 3.135719612229679e-05, 'epoch': 1.12}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.002072723815217614, 'eval_rouge1': 90.204, 'eval_rouge2': 0.2805, 'eval_rougeL': 90.2381, 'eval_rougeLsum': 90.2389, 'eval_gen_len': 2.426701570680628, 'eval_runtime': 472.1101, 'eval_samples_per_second': 0.809, 'eval_steps_per_second': 0.203, 'epoch': 2.0}
{'loss': 0.0032, 'learning_rate': 1.2714392244593587e-05, 'epoch': 2.24}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.00198806868866086, 'eval_rouge1': 88.9274, 'eval_rouge2': 0.2805, 'eval_rougeL': 88.938, 'eval_rougeLsum': 88.9409, 'eval_gen_len': 2.418848167539267, 'eval_runtime': 472.8306, 'eval_samples_per_second': 0.808, 'eval_steps_per_second': 0.203, 'epoch': 3.0}
{'train_runtime': 16516.7067, 'train_samples_per_second': 0.324, 'train_steps_per_second': 0.081, 'train_loss': 0.01490330504979952, 'epoch': 3.0}


TrainOutput(global_step=1341, training_loss=0.01490330504979952, metrics={'train_runtime': 16516.7067, 'train_samples_per_second': 0.324, 'train_steps_per_second': 0.081, 'train_loss': 0.01490330504979952, 'epoch': 3.0})

### Model 저장

In [26]:
# Save the trained model through the trainer into the output directory.
trainer.save_model("./image-captioning-output")

In [27]:
# Save the text tokenizer into the same directory so it can be reloaded from there later.
tokenizer.save_pretrained("./image-captioning-output")

('./image-captioning-output\\tokenizer_config.json',
 './image-captioning-output\\special_tokens_map.json',
 './image-captioning-output\\vocab.json',
 './image-captioning-output\\merges.txt',
 './image-captioning-output\\added_tokens.json',
 './image-captioning-output\\tokenizer.json')

### Captioning Test

In [30]:
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoFeatureExtractor

# Directory the fine-tuned model was saved to.
model_dir = "./image-captioning-output"

# Reload everything needed for inference from that directory, including the
# feature extractor, so this cell does not depend on objects left over from
# the training cells.
model = VisionEncoderDecoderModel.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
feature_extractor = AutoFeatureExtractor.from_pretrained(model_dir)

from PIL import Image

# Path of the image to caption.
image_path = "C:/Ye_Dong/AI_Team_project_final/Data/test10/bag-8152319_1920.jpg"

# Load the image as RGB (handles grayscale / RGBA files) and close the file
# once the pixel data has been copied.
with Image.open(image_path) as img:
    image = img.convert("RGB")

# Preprocess the image into model inputs.
encoder_inputs = feature_extractor(images=[image], return_tensors="pt")

# Generate the caption with beam search. top_k / top_p were removed: they only
# apply when sampling is enabled, which this call does not do.
output = model.generate(**encoder_inputs, max_length=50, num_beams=4, no_repeat_ngram_size=2)

# Decode the generated token ids into text.
generated_caption = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated caption.
print("Generated Caption:", generated_caption)


Generated Caption: etc.
