In [369]:
# import modules
import numpy as np
import tensorflow as tf
import pickle
import gzip
import random
import time
from keras.preprocessing.sequence import pad_sequences

In [42]:
# return image feature dict, caption dict
def load_data(dtype):
    if dtype == 'train':
        filename = './Flickr8k_text/Flickr_8k.trainImages.txt'
    elif dtype == 'valid':
        filename = './Flickr8k_text/Flickr_8k.devImages.txt'
    elif dtype == 'test':
        filename = './Flickr8k_text/Flickr_8k.testImages.txt'
        
    image_ids = []
    features = dict()
    captions = dict()
    
    # get image ids
    with open(filename, 'r') as f:
        lines = f.read().split('\n')
        image_ids = [x.split('.')[0] for x in lines if x] # remove '.jpg'
        
    # get image features
    with gzip.open('./image_features.pkl.zip', 'rb') as f:
        all_featuers = pickle.load(f)
        features = { x: all_featuers[x][0] for x in image_ids }
        
    # get image cpations
    with open('./captions.txt', 'r') as f:
        lines = f.read().split('\n')
        for l in lines:
            tokens = l.split(' ')
            image_id = tokens[0]
            caption = ' '.join(tokens[1:])
            if image_id in captions:
                captions[image_id].append(caption)
            else:
                captions[image_id] = [caption]
            
            
    return features, captions

In [43]:
train_features, train_captions = load_data('train')
valid_features, valid_captions = load_data('valid')
test_features, test_captions = load_data('test')

print('# of images in train:', len(train_features))
print('# of images in valid:', len(valid_features))
print('# of images in test:', len(test_features))

print('feature shape:', list(train_features.items())[0][1].shape)

# of images in train: 6000
# of images in valid: 1000
# of images in test: 1000
feature shape: (4096,)


In [44]:
# caption에 있는 word 중 5번 이상 등장한 word만 사용하여
# word_to_idx, idx_to_word dict를 만듬
# word를 vector화 하여 input으로 주기 위함
# return word_to_idx, idx_to_word, vocab_size

def preprocess_vocab(captions):
    threshold = 20 # threshold of occurence
    count = dict()
    word_to_idx, idx_to_word = dict(), list()

    for caption_list in captions.values():
        for c in caption_list:
            tokens = c.split(' ')
            for t in tokens:
                if t in count:
                    count[t] += 1
                else:
                    count[t] = 1
            
    print('# total of words:', len(count))
    print('# of words that appears >= {}:'.format(threshold), len([w for w in count if count[w] >= threshold]))

    idx_to_word = [w for w in count if count[w] >= threshold]
    idx_to_word.append('<START>')
    idx_to_word.append('<END>')
    for i, w in enumerate(idx_to_word):
        word_to_idx[w] = i
        
    return word_to_idx, idx_to_word, len(idx_to_word)
    
    

In [111]:
# vocab_size: 우리가 사용할 총 단어의 수
# 각 caption을 index vector로 encoding할 수 있음
word_to_idx, idx_to_word, vocab_size = preprocess_vocab(train_captions)
print('vocab_size:', vocab_size)

# total of words: 8764
# of words that appears >= 20: 1289
vocab_size: 1291


In [291]:
# 각 caption들을 숫자로 encoding함
# caption의 최대길이를 구함
# return encoded caption dict, max_len
def encode_caption(word_to_idx, captions):
    maxlen = 0
    result = dict()
    for image_id in captions:
        caption_list = captions[image_id]
        _caption_list = [] # encoded caption list
        for c in caption_list:
            tokens = c.split(' ')
            embeded = []
            for t in tokens:
                # word_to_idx에 없는 단어는 무시
                if t in word_to_idx:
                    embeded.append(word_to_idx[t])

            maxlen = max(maxlen, len(embeded))
            embeded.insert(0, word_to_idx['<START>'])
            embeded.append(word_to_idx['<END>'])
            _caption_list.append(embeded)
            
        result[image_id] = _caption_list
            
    return result, maxlen

In [292]:
encoded_train_captions, maxlen = encode_caption(word_to_idx, train_captions)
print('maximum length of encoded caption:', maxlen)
print('embeded caption example')
print(random.choice(list(encoded_train_captions.items())))

maximum length of encoded caption: 29
embeded caption example
('3524612244_64f00afec5', [[1289, 592, 558, 937, 453, 528, 544, 264, 505, 1290], [1289, 541, 454, 1227, 320, 916, 898, 684, 416, 454, 1194, 1290], [1289, 558, 937, 725, 439, 1290], [1289, 28, 528, 11, 583, 21, 898, 302, 937, 1290], [1289, 390, 403, 725, 1093, 772, 937, 544, 588, 1290]])


In [360]:
# return loss, optimizer, pred
# vocab_size: 사용하는 총 단어의 수
# maxlen: 문장에 등장하는 단어의 최대 횟수
# hidden_size: lstm cell의 hidden state의 크기이자 output vector의 크기
def build_model(vocab_size, hidden_size, learning_rate, embedding_size):
    x_img = tf.placeholder(tf.float32, [None, 4096]) # image feature size = 4096
    x_seq = tf.placeholder(tf.int32, [None, None]) # 
    y_seq = tf.placeholder(tf.int32, [None, None]) # 
    
    img_embedding = tf.layers.dense(x_img, embedding_size, activation=tf.nn.relu) # 4096 to embedding_size
    w_word_embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0)) #  vocab_size to embedding_size
    word_embedding = tf.nn.embedding_lookup(w_word_embedding, x_seq)
#     print('img_embedding:', img_embedding.shape)
#     print('word_embedding:', word_embedding.shape)
    lstm = tf.nn.rnn_cell.LSTMCell(hidden_size)
    init_state = lstm.zero_state(batch_size=tf.shape(x_seq)[0], dtype=tf.float32)
#     print('init_state:', init_state)
    _, state = lstm(img_embedding, init_state)
#     print('vocab_size:', vocab_size)
#     print('x_seq.shape:', x_seq.shape)
    outputs, state = tf.nn.dynamic_rnn(lstm, word_embedding, initial_state=state)
    
    logits = tf.layers.dense(outputs, vocab_size, activation=tf.nn.relu)
    targets = tf.one_hot(y_seq, vocab_size)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=targets))
    loss = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    pred = tf.argmax(logits, axis=0)

    
    return x_img, x_seq, y_seq, loss, optimizer, pred
    

In [367]:
def train(encoded_train_captions, train_features, embedding_size, epochs):
    image_ids = train_features.keys()
    tf.reset_default_graph()
    x_img, x_seq, y_seq, loss, optimizer, pred = build_model(vocab_size, hidden_size, 
                                                         learning_rate, embedding_size)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
    
        for i in range(epochs):
            print('epoch_{}'.format(i+1))
            start = time.time()
            for i, image_id in enumerate(image_ids):
                img_feature = train_features[image_id]
                img_feature = np.expand_dims(img_feature, axis=0)
                captions = encoded_train_captions[image_id]
#                 print('img_feature.shape:', img_feature.shape)
                for c in captions:
                    c = np.array(c)
                    c = np.expand_dims(c, axis=0)
                    _x_seq = c[:, :-1]
                    _y_seq = c[:, 1:]
#                     print('_y_seq:', _y_seq.shape)
                    feed = {
                        x_img: img_feature,
                        x_seq: c[:, :-1],
                        y_seq: c[:, 1:]
                    }
                    l, _ = sess.run([loss, optimizer], feed)
                    
                print('iter: {}/{}, loss:{}'.format(i+1, len(image_ids), l), end='\r')
                
            end = time.time()
            print('Elapsed for this epoch:', end-start, 'sec')
                    


In [370]:
learning_rate = 0.001
hidden_size = 256
epochs = 10
embedding_size = 128

train(encoded_train_captions, train_features, embedding_size, epochs)

epoch_1
iter: 2051/6000, loss:5.5774159431457525

KeyboardInterrupt: 