In [None]:
# Install the third-party packages this notebook imports (%pip is the IPython line magic form of pip).
%pip install kaggle transformers sentence-transformers torch torchvision numpy matplotlib pillow


In [None]:
# Instructions for MacOS/Linux Machine
!mkdir ~/.kaggle
!copy kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the sakshighadigaonkar/flickr-8k dataset with the kaggle command-line tool.
!kaggle datasets download sakshighadigaonkar/flickr-8k


In [None]:
!unzip flickr-8k.zip

In [None]:
# libraries
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
%matplotlib inline

import string
import os
from PIL import Image
from time import time

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.nn.functional as F

In [None]:
# Locations of the caption file, the train/test split lists and the image directory.
token_path = "./flickr-8k/Flickr8k_text/Flickr8k.token.txt"
train_images_path = './flickr-8k/Flickr8k_text/Flickr_8k.trainImages.txt'
test_images_path = './flickr-8k/Flickr8k_text/Flickr_8k.testImages.txt'
images_path = './flickr-8k/Flickr8k_Dataset/Flicker8k_Dataset'

# Read the whole caption file into memory; the context manager closes the
# file handle as soon as the read is done.
with open(token_path, 'r') as token_file:
    doc = token_file.read()

#  NLP setup

In [None]:
# Load the text-embedding model and its tokenizer.
# AutoTokenizer is provided by `transformers`; `sentence_transformers` does not export it,
# so importing it from there raises ImportError.
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nvidia/NV-Embed-v2")
# trust_remote_code=True lets the model repository's own modelling code be executed.
embedding_model = SentenceTransformer('nvidia/NV-Embed-v2', trust_remote_code=True)
embedding_model.max_seq_length = 32768
embedding_model.tokenizer.padding_side="right"
def add_eos(input_examples):
    """Return a new list in which every input string ends with the embedding
    tokenizer's end-of-sequence token.

    Reads the module-level `embedding_model`; the input list is not modified.
    """
    with_eos = []
    for text in input_examples:
        with_eos.append(text + embedding_model.tokenizer.eos_token)
    return with_eos
# Size of the output vocabulary, caption length cap and photos per generated batch.
vocab_size = 32000
max_length = 32
num_photos_per_batch = 1000


def pad_sequences(sequences, maxlen=None, dtype="int32", padding="pre", truncating="pre", value=0):
    """Pad (or truncate) integer sequences to a common length.

    Hugging Face tokenizers expose no `pad_sequences` attribute, so the helper is
    implemented here with the call shape `data_generator` relies on
    (`pad_sequences([seq], maxlen=...)[0]`). The defaults pad and truncate at the
    front of each sequence, which is what the Keras helper of the same name does
    by default.

    Args:
        sequences: iterable of integer sequences.
        maxlen: target length; defaults to the length of the longest sequence.
        dtype: dtype of the returned array.
        padding: "pre" or "post" - where the fill values are placed.
        truncating: "pre" or "post" - which end is dropped when a sequence is too long.
        value: fill value.

    Returns:
        numpy array of shape (len(sequences), maxlen).

    Raises:
        ValueError: if `padding` or `truncating` is not "pre" or "post".
    """
    if padding not in ("pre", "post"):
        raise ValueError("padding must be 'pre' or 'post', got %r" % (padding,))
    if truncating not in ("pre", "post"):
        raise ValueError("truncating must be 'pre' or 'post', got %r" % (truncating,))

    rows = [list(seq) for seq in sequences]
    if maxlen is None:
        maxlen = max((len(row) for row in rows), default=0)

    padded = np.full((len(rows), maxlen), value, dtype=dtype)
    for index, row in enumerate(rows):
        # Guard maxlen == 0 explicitly: row[-0:] would return the whole row.
        if maxlen == 0 or not row:
            continue
        kept = row[-maxlen:] if truncating == "pre" else row[:maxlen]
        if padding == "pre":
            padded[index, maxlen - len(kept):] = kept
        else:
            padded[index, :len(kept)] = kept
    return padded



In [None]:


# Load the image descriptions: map each image id to the list of its captions.
from pathlib import Path

descriptions = {}
for raw_line in doc.split('\n'):
    # Lines of two characters or fewer are skipped.
    if len(raw_line) <= 2:
        continue
    fields = raw_line.split()
    # The image id is the first field, cut at its first '.'; the remaining fields form the caption.
    image_id = fields[0].split('.')[0]
    descriptions.setdefault(image_id, []).append(' '.join(fields[1:]))

# Flatten the descriptions into "<image id> <caption>" lines and join them into
# one newline-separated string.
lines = [
    image_key + ' ' + caption
    for image_key, captions in descriptions.items()
    for caption in captions
]
new_descriptions = '\n'.join(lines)

# Load the identifiers of the train images.
# The file contents get their own name so that `doc` (the caption text read earlier)
# is not overwritten - otherwise re-running the caption-parsing code would parse
# the wrong file. The context manager closes the file handle.
with open(train_images_path, 'r') as train_file:
    train_doc = train_file.read()

# One identifier per non-trivial line: the text before the first '.'.
dataset = [line.split('.')[0] for line in train_doc.split('\n') if len(line) > 1]

train = set(dataset)


# Collect the descriptions of the train images.
train_descriptions = dict()
# Iterate over the flattened "<image id> <caption>" lines in `new_descriptions`;
# `descriptions` itself is a dict and has no `.split`.
for line in new_descriptions.split('\n'):
    tokens = line.split()
    if not tokens:
        # Blank line (e.g. when there are no descriptions at all): nothing to parse.
        continue
    image_id, image_desc = tokens[0], tokens[1:]
    if image_id in train:
        # Wrap each caption in start/end sequence tokens for the later transformation.
        desc = '<s> ' + ' '.join(image_desc) + ' </s>'
        train_descriptions.setdefault(image_id, []).append(desc)

# Image Encoding


In [None]:
# Collect the image files that belong to the train and test splits.
def _read_split_names(split_path):
    """Return the set of non-empty, whitespace-stripped lines of `split_path`."""
    with open(split_path, 'r') as split_file:
        return {name.strip() for name in split_file.read().splitlines() if name.strip()}


# Sorted so the image order does not depend on the file system's listing order.
img = sorted(Path(images_path).glob('*.jpg'))

# `Path.name` is the final path component on every platform, so no manual
# splitting on '/' or '\\' is needed (Path objects have no `.split` method).
train_images = _read_split_names(train_images_path)
train_img = [image_file for image_file in img if image_file.name in train_images]

test_images = _read_split_names(test_images_path)
test_img = [image_file for image_file in img if image_file.name in test_images]


In [None]:
def preprocess_image(image_path):
    """Load an image file and turn it into a normalized single-image batch.

    The image is resized (256), center-cropped (224), converted to a tensor and
    normalized with the per-channel mean/std listed below; `unsqueeze(0)` then
    adds a leading batch dimension.

    Args:
        image_path: path of the image file to open.

    Returns:
        The transformed image tensor with a leading batch dimension of size 1.
    """
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # The context manager releases the file handle; convert("RGB") guarantees three
    # channels so the 3-value mean/std also works for grayscale or RGBA files.
    with Image.open(image_path) as image:
        return transform(image.convert("RGB")).unsqueeze(0)

# Load the ImageNet-pretrained ViT-B/32; the `weights=` argument replaces the
# deprecated `pretrained=True` flag.
vit_model = models.vit_b_32(weights=models.ViT_B_32_Weights.IMAGENET1K_V1)
# Swap the classification head for an identity so that calling the model returns the
# encoder's class-token features. Chaining the child modules in nn.Sequential skips
# VisionTransformer.forward (patch flattening, class token) and fails at the encoder.
vit_model.heads = nn.Identity()
# Feature extraction only: eval() switches off dropout.
vit_model.eval()
# `vit` is the same (now headless) module object as `vit_model`.
vit = vit_model
def encode_image(image_path):
    """Return the output of `vit` for the image at `image_path`.

    The image is prepared by `preprocess_image`, and the forward pass runs with
    gradient tracking disabled.
    """
    batch = preprocess_image(image_path)
    with torch.no_grad():
        features = vit(batch)
    return features


In [None]:
# NOTE(review): DataLoader is called without a dataset here; its first parameter
# (`dataset`) appears to be required, so this line likely raises TypeError - confirm
# and pass the intended dataset. `train_DL` is not referenced by the other cells shown.
train_DL = torch.utils.data.DataLoader()

# Building the DataLoader

In [None]:
def _lookup_photo(photos, key):
    """Return the entry of `photos` for image id `key`.

    The original Windows-style key (`images_path + '\\' + key + '.jpg'`) is tried
    first so existing feature mappings keep working; portable spellings follow so
    the lookup also succeeds on MacOS/Linux. Raises KeyError if none matches.
    """
    filename = key + '.jpg'
    candidates = (
        images_path + '\\' + filename,
        images_path + '/' + filename,
        filename,
        key,
    )
    for candidate in candidates:
        if candidate in photos:
            return photos[candidate]
    raise KeyError(candidates[0])


def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch):
    """Endlessly yield training batches of (photo, caption prefix) -> next word.

    Every caption is encoded with `wordtoix` (unknown words are dropped) and split
    into one sample per position: the padded prefix `seq[:i]` is the input and the
    one-hot encoding of `seq[i]` is the target. A batch is yielded each time
    `num_photos_per_batch` photos have been processed; the loop over
    `descriptions` restarts indefinitely.

    Uses the module-level `images_path`, `pad_sequences` and `vocab_size`.

    Args:
        descriptions: mapping of image id -> list of caption strings.
        photos: mapping from image key to photo feature (see `_lookup_photo`).
        wordtoix: mapping of word -> integer index (indices must be < vocab_size).
        max_length: length every input prefix is padded to.
        num_photos_per_batch: number of photos per yielded batch.

    Yields:
        ([array of photo features, array of padded prefixes], array of one-hot targets)

    Raises:
        ValueError: if `descriptions` is empty or `num_photos_per_batch` < 1
            (either would make the loop spin forever without yielding).
        KeyError: if a photo cannot be found in `photos`.
    """
    if num_photos_per_batch < 1:
        raise ValueError("num_photos_per_batch must be at least 1")
    if not descriptions:
        raise ValueError("descriptions is empty; there is nothing to generate")

    X1, X2, y = [], [], []
    n = 0
    # Loop forever over the images.
    while True:
        for key, desc_list in descriptions.items():
            n += 1
            # Retrieve the photo feature.
            photo = _lookup_photo(photos, key)
            for desc in desc_list:
                # Encode the caption, dropping words missing from the vocabulary.
                seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
                # Split one sequence into multiple (prefix, next word) pairs.
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    # Pad the input prefix to a fixed length.
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # F.one_hot needs an integer tensor; convert the result to numpy so the
                    # batch can be stacked with `array` below.
                    out_seq = F.one_hot(torch.tensor(out_seq), num_classes=vocab_size).numpy()
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)

            if n == num_photos_per_batch:
                yield ([array(X1), array(X2)], array(y))
                X1, X2, y = [], [], []
                n = 0

# Image Captioning

In [None]:
class Image_Caption(nn.Module):
    """Predict a distribution over the vocabulary from image features and text.

    Image features pass through dropout and a linear layer to 256 dimensions.
    Text is embedded by the module-level `embedding_model` (kept out of the
    gradient computation), projected to `hidden_dim`, fed through an LSTM with
    256 hidden units, and added to the image branch. The decoder maps the fused
    256-dimensional vector to `vocab_size` softmax probabilities.

    Args:
        drop_p: dropout probability for both branches.
        image_dim: size of the incoming image feature vector.
        vocab_size: number of output classes.
        embedding_dim: size of the vectors returned by the embedding model.
        hidden_dim: size of the projected text features fed to the LSTM.

    Note: the decoder contains a `vocab_size x vocab_size` linear layer, so its
    size grows quadratically with the vocabulary.
    """

    def __init__(self, drop_p=0.5, image_dim=256, vocab_size=32000, embedding_dim=4096, hidden_dim=1024):
        super().__init__()
        self.image_encoder = nn.Sequential(
            nn.Dropout(drop_p),
            nn.Linear(image_dim, 256),
            nn.ReLU(),
        )

        self.embedding = embedding_model
        self.text_dropout = nn.Dropout(drop_p)
        self.linear = nn.Linear(embedding_dim, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, 256, batch_first=True)

        self.decoder = nn.Sequential(
            nn.Linear(256, vocab_size),
            nn.ReLU(),  # the class is spelled `ReLU`; `nn.ReLu` does not exist
            nn.Linear(vocab_size, vocab_size),
            nn.Softmax(dim=1)
        )

    def forward(self, image_features, text):
        """Return class probabilities of shape (batch, vocab_size).

        Args:
            image_features: tensor of shape (batch, image_dim).
            text: list of `batch` strings.
        """
        image_features = self.image_encoder(image_features)
        with torch.no_grad():
            # `prompt` expects a string, so it is left out to encode without a prefix;
            # convert_to_tensor=True requests a torch tensor instead of a numpy array.
            text_features = self.embedding.encode(
                add_eos(text),
                batch_size=1,
                normalize_embeddings=True,
                convert_to_tensor=True,
            )
        # Make a plain tensor copy on the same device/dtype as the trainable layers.
        text_features = torch.as_tensor(text_features).detach().clone()
        text_features = text_features.to(
            device=self.linear.weight.device, dtype=self.linear.weight.dtype
        )
        text_features = self.text_dropout(text_features)
        text_features = self.linear(text_features)
        # One embedding per sample: add a sequence axis of length 1 so the batch-first
        # LSTM sees (batch, seq, feature) and the last-step indexing below is valid.
        lstm_out, _ = self.lstm(text_features.unsqueeze(1))
        text_features = lstm_out[:, -1, :]
        fused_features = image_features + text_features

        output = self.decoder(fused_features)
        return output
    
# Instantiate the captioning model with its default hyper-parameters.
model = Image_Caption()

# Same value as the `vocab_size` default of Image_Caption used for `model` above.
vocab_size = 32000



In [None]:
# Build the word -> index mapping, create the batch generator and start training.
# NOTE(review): `word_index` looks like a Keras-Tokenizer concept, while `tokenizer` comes from
# AutoTokenizer.from_pretrained - confirm this call is valid (tokenizer.get_vocab() may be intended).
wordtoix = tokenizer(word_index=True)
# Shift every index up by one - presumably to keep 0 free for padding; TODO confirm.
wordtoix = {k: v+1 for k, v in wordtoix.items()}
# NOTE(review): `train_img` is a list of image paths, but data_generator indexes `photos` with a
# string key - verify that a mapping of image features is what should be passed here.
generator = data_generator(train_descriptions, train_img, wordtoix, max_length, num_photos_per_batch)
# NOTE(review): `model` is an Image_Caption (an nn.Module subclass) that defines no `fit` method in
# this notebook - confirm; an explicit PyTorch training loop is probably needed instead.
model.fit(generator, epochs=15, steps_per_epoch=100, verbose=1)
