In [7]:
# 경로 및 파일 이름 설정
annotation_dir = "../img_test_08/"

image_dir = "../img_test_08/AI_Result_img"
output_dir = "../img_test_08/Reshaped_img"

test_image_name_dir = "../img_test_08/test_image_list.txt"
train_image_name_dir = "../img_test_08/train_image_list.txt"
caption_file = "../img_test_08/test07caption.txt"

### 이미지 갯수 확인하기

In [3]:
from PIL import Image
import os

In [4]:
# 폴더 내의 이미지 파일 목록 가져오기
image_files = [f for f in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, f))]

# 이미지 파일의 확장자를 확인하고 갯수 counting
image_extensions = ['.jpg', '.jpeg', '.png']
image_count = sum(1 for f in image_files if any(f.endswith(ext) for ext in image_extensions))

print(f"폴더 '{image_dir}' 내에 {image_count} 개의 이미지 파일이 있습니다.")


폴더 '../img_test_08/AI_Result_img' 내에 152 개의 이미지 파일이 있습니다.


### 이미지 처리

In [9]:
import cv2
import numpy as np

In [10]:
# 배경 제거 함수 정의 (예: GrabCut 알고리즘 사용)
def remove_background(input_image_path, output_image_path):
    image = cv2.imread(input_image_path)
    
    # 초기 마스크 생성
    mask = np.zeros(image.shape[:2], np.uint8)

    # 객체를 둘러싸는 직사각형 경계 상자 정의 (예: 이미지 전체)
    rect = (0, 0, image.shape[1], image.shape[0])

    # GrabCut 알고리즘 적용
    cv2.grabCut(image, mask, rect, None, None, 5, cv2.GC_INIT_WITH_RECT)

    # 결과 마스크 생성
    mask2 = np.where((mask == 2) | (mask == 0), 0, 1).astype('uint8')

    # 배경 제거된 이미지 생성
    result_image = image * mask2[:, :, np.newaxis]
    
    cv2.imwrite(output_image_path, result_image)

# 노이즈 제거 함수 정의 (예: 미디언 필터 사용)
def remove_noise(input_image_path, output_image_path):
    image = cv2.imread(input_image_path)
    
    # 미디언 필터 적용
    result_image = cv2.medianBlur(image, 5)  # 필터 크기 조정 가능

    cv2.imwrite(output_image_path, result_image)

# 이미지 파일 이름을 가져오는 함수 정의
def get_image_names(img_dir):
    image_names = []
    for filename in os.listdir(img_dir):
        if filename.endswith(".png"):
            image_names.append(filename)
    return image_names

# 이미지 파일 이름 리스트 가져오기
image_names = get_image_names(image_dir)

# 이미지 전처리 및 저장
for image_name in image_names:
    input_image_path = os.path.join(image_dir, image_name)
    output_image_path = os.path.join(output_dir, image_name)

    # 배경 제거 및 노이즈 제거 적용
    remove_background(input_image_path, "temp_image.png")
    remove_noise("temp_image.png", output_image_path)

    # 임시 이미지 파일 삭제
    os.remove("temp_image.png")

error: OpenCV(4.8.0) D:\a\opencv-python\opencv-python\opencv\modules\imgproc\src\grabcut.cpp:386: error: (-215:Assertion failed) !bgdSamples.empty() && !fgdSamples.empty() in function 'initGMMs'


### Caption 분류

In [4]:
import os
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# 이미지 파일 리스트와 캡션 리스트 생성
image_list = sorted(os.listdir(image_dir))
captions = []

with open(caption_file, "r") as file:
    for line in file:
        image_name, caption = line.strip().split("\t")[0], line.strip().split("\t")[1]
        captions.append(caption)

# Tokenizer를 사용하여 캡션 텍스트 토큰화
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)

# 텍스트 시퀀스를 정수 시퀀스로 변환
sequences = tokenizer.texts_to_sequences(captions)

# 시퀀스 패딩 (모든 시퀀스를 동일한 길이로 맞춤)
max_sequence_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# 이미지 파일 이름과 정수 시퀀스를 매칭
image_caption_map = dict(zip(image_list, sequences))

# train, test 데이터 분리
random.shuffle(image_list)
split_ratio = 0.8
split_index = int(split_ratio * len(image_list))
train_image_names = image_list[:split_index]
test_image_names = image_list[split_index:]

# train, test 데이터에 해당하는 캡션 리스트 생성
train_captions = [image_caption_map[image_name] for image_name in train_image_names]
test_captions = [image_caption_map[image_name] for image_name in test_image_names]

# 이미지 파일 이름을 텍스트 파일로 저장하는 함수 정의
def save_image_list(image_names, output_file):
    with open(output_file, 'w') as file:
        for image_name in image_names:
            file.write(image_name + '\n')

# 이미지 파일 이름을 각각의 텍스트 파일에 저장
save_image_list(train_image_names, train_image_name_dir)
save_image_list(test_image_names, test_image_name_dir)

In [6]:
def read_file(file_name):
    with open(os.path.join(annotation_dir, file_name), 'rb') as file_handle:
        file_lines = file_handle.read().splitlines()
    return file_lines

In [7]:
# train, test 이미지 이름 불러오기
train_image_paths = read_file('train_image_list.txt')
test_image_paths = read_file('test_image_list.txt')
captions = read_file('test07caption.txt')

In [8]:
print(len(train_image_paths))
print(len(test_image_paths))
print(len(captions))

121
31
760


### Vocabulary

In [10]:
def up_get_vocab():
    image_caption_map = {}
    unique_words = set()

    max_words = 0

    for caption in captions:
        caption = caption.decode("utf-8")
        image_name = caption.split('#')[0]
        image_caption = caption.split('#')[1].split('\t')[1]

        if image_name not in image_caption_map:
            image_caption_map[image_name] = [image_caption]
        else:
            image_caption_map[image_name].append(image_caption)

        caption_words = image_caption.split()
        max_words = max(max_words, len(caption_words))
        unique_words.update(caption_words)

    unique_words = list(unique_words)
    word_to_index_map = {word: index for index, word in enumerate(unique_words)}
    index_to_word_map = {index: word for index, word in enumerate(unique_words)}

    return image_caption_map, max_words, unique_words, word_to_index_map, index_to_word_map


### ImageModel

In [11]:
from keras.applications.vgg16 import VGG16
from keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np
from tensorflow.keras.applications.vgg16 import preprocess_input
import pickle

In [12]:
class ImageModel:
    def __init__(self):
        vgg_model = VGG16(weights='imagenet', include_top=True)
        self.model = Model(inputs=vgg_model.input,
                           outputs=vgg_model.get_layer('fc2').output)

    # 이미지 읽어 들여 전처리하는 매서드
    @staticmethod
    def load_preprocess_image(image_path):
        image_array = load_img(image_path, target_size=(224, 224))
        image_array = img_to_array(image_array)
        image_array = np.expand_dims(image_array, axis=0)
        image_array = preprocess_input(image_array)
        return image_array

    # 이미지를 로딩하여 예측하는 매서드
    def extract_feature_from_image_path(self, image_path):
        image_array = self.load_preprocess_image(image_path)
        features = self.model.predict(image_array)
        return features.reshape((4096, 1))

    # 이미지 경로를 포함한 리스트를 따라가며 특징 리스트 생성 매서드
    def extract_feature_from_image_paths(self, work_dir, image_names):
        features = []
        for image_name in image_names:
            image_path = os.path.join(work_dir, image_name)
            feature = self.extract_feature_from_image_path(image_path)
            features.append(feature)
        return features

    # 추출한 특징을 pickle 파일로 저장 매서드
    def extract_features_and_save(self, work_dir, image_names, file_name):
        features = self.extract_feature_from_image_paths(work_dir, image_names)
        with open(file_name, 'wb') as p:
            pickle.dump(features, p)


In [27]:
# trian, test 이미지 특징 추출
I = ImageModel()

I.extract_features_and_save(b'../img_test_08/AI_Result_img',train_image_paths, 'train_image_features.p')
I.extract_features_and_save(b'../img_test_08/AI_Result_img',test_image_paths, 'test_image_features.p')

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


In [13]:
with open('../img_test_08/train_image_features.p', 'rb') as p:
      train_ds = pickle.load(p)
with open('../img_test_08/test_image_features.p', 'rb') as p:
      test_ds = pickle.load(p)

In [14]:
print(train_ds)
print('----------------------------------------------------')
print(test_ds)

[array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32), array([[0.       ],
       [0.       ],
       [0.       ],
       ...,
       [0.       ],
       [0.       ],
       [0.6050958]], dtype=float32), array([[0.        ],
       [0.        ],
       [0.34871292],
       ...,
       [0.        ],
       [1.0886719 ],
       [0.381657  ]], dtype=float32), array([[0.       ],
       [0.       ],
       [0.       ],
       ...,
       [0.       ],
       [0.5233148],
       [1.901562 ]], dtype=float32), array([[0.       ],
       [0.       ],
       [0.       ],
       ...,
       [0.       ],
       [0.8247345],
       [1.0853554]], dtype=float32), array([[0.        ],
       [1.3335574 ],
       [0.        ],
       ...,
       [0.        ],
       [0.5466149 ],
       [0.15384722]], dtype=float32), array([[0.       ],
       [0.       ],
       [0.       ],
       ...,
       [0.       ],
       [0.9094123],
       [1.7897921]], dt

In [15]:
# image_caption_map, max_words, unique_words, word_to_index_map, index_to_word_map = get_vocab()
image_caption_map, max_words, unique_words, word_to_index_map, index_to_word_map = up_get_vocab()
vocabulary_size = len(unique_words)

In [16]:
train_descriptions={}

image_names = list(train_image_paths)
test_image_names = list(test_image_paths)

for i in range(0, len(train_image_paths)):
  image_name = image_names[i].decode('utf-8')
  caption_list = image_caption_map[image_name]
  desc=[]
  for caption in caption_list:
    desc.append(f'startseq {caption} endseq')
  train_descriptions[image_name] = desc
  print(desc)

['startseq The absence of objects is conspicuous against the bright illumination that highlights the black background. endseq', 'startseq Dim lighting exposes the presence of a polluted white plastic background. endseq', 'startseq In the low light setting, there is a tainted white plastic surface. endseq', 'startseq A contaminated white plastic backdrop is visible in the dim light. endseq', 'startseq Under subdued lighting, a white plastic background is tainted. endseq']
['startseq A paper with various colors printed on it is tied with a yellow rubber band on the white background. endseq', 'startseq Against the backdrop of white, a paper featuring a spectrum of printed colors is secured in place by a yellow rubber band. endseq', "startseq Resting on the pristine white surface, you'll find a paper adorned with an array of colors, held together using a yellow rubber band. endseq", 'startseq The white background serves as the canvas for a paper with diverse printed colors, fastened with a

In [17]:
from tensorflow.keras.utils import to_categorical

In [18]:
def data_generator(descriptions,image_paths, images, word_to_index, max_length, num_images_per_batch ):
  x1, x2, y = [], [], []
  n=0
  image_names = list(image_paths)
  np.random.shuffle(image_names)
  while True:
    for j in range(9):
      n+=1
      image_name = image_names[j].decode('utf-8')
      caption_list = descriptions[image_name]
      image = images[image_paths.index(b''+image_name.encode())]

      for desc in caption_list:
        seq = [word_to_index[word] for word in desc.split(' ') if word in word_to_index]
        for i in range(1, len(seq)):
          in_seq, out_seq = seq[:i], seq[i]
          in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
          out_seq = to_categorical([out_seq], num_classes=vocabulary_size)[0]
          x1.append(image)
          x2.append(in_seq)
          y.append(out_seq)
      if n == num_images_per_batch:
        yield ([np.array(x1), np.array(x2)], np.array(y))
        x1, x2, y = [],[],[]
        n=0


In [20]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector, Concatenate, Activation, Flatten
from keras.layers import Concatenate
from keras.preprocessing import image, sequence
from keras import optimizers
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

In [None]:
number_images_per_batch = 3
steps = len(train_descriptions) // number_images_per_batch
EPOCHS = 150

In [21]:
tf.config.run_functions_eagerly(True)

# 이미지 모델 정의
image_model = Sequential()
image_model.add(Dense(256, input_dim=4096, activation='relu'))

image_model.add(RepeatVector(max_words))

# 언어 모델 정의
lang_model = Sequential()
lang_model.add(Embedding(vocabulary_size, 256, input_length=max_words))
lang_model.add(LSTM(512, return_sequences=True))
lang_model.add(TimeDistributed(Dense(128)))

# 두 모델 합치기
model = Sequential()
merged = Concatenate()([image_model.output, lang_model.output])
lstm_layer = LSTM(1000, return_sequences=False)(merged)
output_layer = Dense(vocabulary_size, activation='softmax')(lstm_layer)
model = Model(inputs=[image_model.input, lang_model.input], outputs=output_layer)
model.compile(loss='categorical_crossentropy', optimizer=optimizers.Adam(learning_rate=0.0001), metrics=['accuracy'])



# 조기 종료 콜백 정의 (검증 데이터 손실을 모니터링)
# early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [22]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 embedding_input (InputLayer)   [(None, 46)]         0           []                               
                                                                                                  
 dense_input (InputLayer)       [(None, 4096)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 46, 256)      204288      ['embedding_input[0][0]']        
                                                                                                  
 dense (Dense)                  (None, 256)          1048832     ['dense_input[0][0]']            
                                                                                              

In [None]:
from tqdm import tqdm

In [24]:
train_fit = data_generator(train_descriptions, train_image_paths ,train_ds, word_to_index_map, max_words, number_images_per_batch)

for i in tqdm(range(EPOCHS)):
  model.fit_generator(train_fit, epochs = 50, steps_per_epoch = steps, verbose = 1)

  model.fit_generator(data_generator(train_descriptions, train_image_paths ,train_ds, word_to_index_map, max_words, number_images_per_batch),epochs=1,steps_per_epoch=steps, verbose=1)


 3/40 [=>............................] - ETA: 3:58 - loss: 6.5859 - accuracy: 0.0088

  0%|          | 0/150 [00:25<?, ?it/s]


KeyboardInterrupt: 

In [None]:
model.save('../img_test_08/imagecaption.h5')

In [None]:
def generateCaption(image):
    in_text = 'startseq'
    for i in range(max_words):
        sequence = [word_to_index_map[w] for w in in_text.split() if w in word_to_index_map.keys()]
        sequence = pad_sequences([sequence], maxlen=max_words)
        yhat = model.predict([image, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = index_to_word_map[yhat]
        if word == 'endseq':
            break
        in_text += ' ' + word
    return in_text

### 반복 단어 제거

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
def preprocess_caption(caption):
    # 소문자로 변환
    caption = caption.lower()

    # 문장을 단어로 토큰화
    words = word_tokenize(caption)

    # 불용어(stopword) 제거
    words = [word for word in words if word not in stopwords.words('english')]

    # 중복 단어 제거
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)

    # 정제된 단어들을 다시 문장으로 결합
    cleaned_caption = ' '.join(unique_words)

    # 시작 및 끝 토큰 추가
    cleaned_caption = cleaned_caption + ' endseq'

    return cleaned_caption


### 이미지 및 캡션 결과 

In [None]:
import random
import matplotlib.pyplot as plt

In [None]:
for i in range(30):
  z = z = random.randint(0, len(test_ds) - 1)
  image_name = test_image_paths[z].decode('utf-8')  # 이미지 파일 이름 가져오기
  image = test_ds[z]
  image = image.reshape((1, 4096))

  image_path = os.path.join('../img_test_08/AI_Result_img', image_name)

  # 이미지 파일을 PIL 모듈을 사용하여 열고 변환
  pil_image = Image.open(image_path)

  plt.imshow(pil_image)  # 이미지를 바로 표시
  plt.show()
  caption_Re = generateCaption(image)
  for caption in captions:
    cleaned_caption = preprocess_caption(caption_Re)
  print("Caption:", cleaned_caption)
  print("--------------------------------------------------------------------------")