In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the project's `lib` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
import os
import random

import numpy as np
from numpy import array
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from keras import Input, layers, optimizers
from keras.callbacks import ModelCheckpoint
from keras.models import Model
from keras.layers import LSTM, Embedding, Dense,Dropout, LeakyReLU
from keras.layers.merge import add
from keras.optimizers import Adam
from keras.utils import to_categorical

from lib.data.flickr_dataset import FlickrDataset
from lib.data.feature_vector_generator import FeatureVectorGenerator
from lib.data.glove_word_embedding_generator import GloveWordEmbeddingGenerator
from lib.data.generator.data_generator import DataGenerator

from lib.model.model_wrapper import ModelWrapper
from lib.model.metrics import rmse
from lib.model.greedy_search import GreedySearch

from lib.utils.word_utils import word_to_index_and_index_to_word
from lib.utils.file_utils import create_directory
from lib.utils.plot_utils import show_sample
from lib.utils.array_utils import column
from lib.utils.pickle_utils import save_obj, load_obj

from lib.callback.metric_plotter import MetricsPlotter
from lib.callback.adam_learning_rate_tracker import AdamLearningRateTracker

In [5]:
# Dataset variant to work on. Every dataset-specific path below is derived
# from this name; the alternative value is '30k'.
dataset_name = '8k'
# dataset_name = '30k'

# Regular expression handed to FlickrDataset as `separator`, one per variant.
line_separator = {
    '8k': r'#[0-9]',
    '30k': r'\| [0-9]\|',
}

# Inputs: pre-trained GloVe vectors (shared) and the per-dataset files.
word_embedding_path = './dataset/glove.6B.200d.txt'
data_path = f'./dataset/{dataset_name}/data'
images_path = f'./dataset/{dataset_name}/images'
image_features_path = f'./dataset/{dataset_name}/img_features.pkl'

# Outputs: checkpoint directory and the file name template later given to
# ModelCheckpoint. The {epoch}/{val_loss}/{val_rmse} placeholders are left
# unformatted here on purpose. Note that the "acc" part of the name actually
# holds the val_rmse metric.
weights_path = create_directory(f'weights/{dataset_name}')
weights_file_path_patern = weights_path + '/weights__epoch_{epoch:02d}__loss_{val_loss:.4f}__acc_{val_rmse:.4f}.h5'

In [6]:
# Load the captions/images of the selected Flickr variant.
# desc_prefix / desc_postfix ('$' and '#') wrap every description, as the
# "Max len desc" line printed below shows; the same two tokens are passed to
# GreedySearch in the last cell, so keep them in sync.
# NOTE(review): clean_desc=True presumably normalises the caption text — confirm
# in lib.data.flickr_dataset.
dataset = FlickrDataset(
    data_path, 
    images_path,
    desc_prefix='$', 
    desc_postfix='#',
    clean_desc=True,
    separator=line_separator[dataset_name]
)

Max len desc: $ an africanamerican man wear green sweatshirt and blue vest be hold up dollar bill in front of his face while stand on busy sidewalk in front of group of man play instrument #


In [7]:
# Fixed seed so the train/val/test partition is identical on every run.
# Without it each kernel restart reshuffles the samples, and weights loaded
# from an earlier run could then be evaluated on images they were trained on.
split_seed = 42

# 70% train; the remaining 30% is divided 90/10 into validation and test.
train_samples, remain_samples = train_test_split(
    dataset.samples(),
    test_size=0.3,
    random_state=split_seed
)
val_samples, test_samples = train_test_split(
    remain_samples,
    test_size=0.1,
    random_state=split_seed
)
print(f'Train: {len(train_samples)}, Val: {len(val_samples)}, Test: {len(test_samples)}')

Train: 5663, Val: 2185, Test: 243


In [8]:
# Peek at one training sample: an (image path, list of descriptions) pair.
train_samples[0]

('./dataset/8k/images/3298175192_bbef524ddc.jpg',
 ['$ black and white dog be chew on camera #',
  '$ black and white dog rest its head on camera #',
  '$ puppy play with camera #',
  '$ black and white dog chew on canon camera set in the grass the camera be black and white #',
  '$ black and white puppy gnaws camera #'])

In [9]:
# Generate the image feature vectors only when the cache file does not exist
# yet, and store them at image_features_path.
if not os.path.isfile(image_features_path):
    image_paths = dataset.samples(col=0)
    save_obj(
        image_features_path,
        list(FeatureVectorGenerator().generate(image_paths))
    )

# Always read the features back from the cache file, so the cached and the
# freshly generated case follow the same path from here on.
# NOTE(review): load_obj comes from lib.utils.pickle_utils; if it unpickles the
# file, only point it at cache files you created yourself.
image_feature_pairs = load_obj(image_features_path)
print(image_feature_pairs[0])

# Index the (image path, feature vector) pairs by image path for lookups.
image_features = dict(image_feature_pairs)

('./dataset/8k/images/1305564994_00513f9a5b.jpg', array([ 0.06508786,  0.03218709,  0.0237698 , ...,  0.3836866 ,
        0.19910577,  0.23510413], dtype=float32))


In [10]:
# Minimum number of occurrences a word needs to be kept in the vocabulary.
# The threshold is passed to words_set() through this variable so the printed
# label and the actual filter cannot drift apart.
min_occurs = 10
vocabulary = dataset.words_set(min_occurs=min_occurs)
vocabulary_size = len(vocabulary)

# Size of the unfiltered vocabulary, for comparison only.
complete_vocabulary_size = len(dataset.words_set())
print(f'Words(occurs>={min_occurs}): {vocabulary_size}/{complete_vocabulary_size}')

Words(occurs>=10): 1593/6688


In [11]:
# Derive the word -> index and index -> word lookups from the filtered
# vocabulary; both are passed to the data generators and to GreedySearch.
word_to_index, index_to_word = word_to_index_and_index_to_word(vocabulary)

In [12]:
# Sanity check: number of entries in the word -> index lookup
# (compare with vocabulary_size printed above).
len(word_to_index)

1593

In [13]:
# Value used as the sequence length: it is passed to the data generators, to
# GreedySearch and to the model's "Description_Sequence" input.
# NOTE(review): the recorded output (174) is far larger than the ~34
# space-separated tokens of the longest description printed when the dataset
# was loaded, so this may be a character count rather than a token count —
# confirm the unit in FlickrDataset.max_desc_len().
dataset.max_desc_len()

174

In [None]:
# Width of one word vector; it should match the "200d" GloVe file referenced
# by word_embedding_path.
embedding_vector_dim = 200
embedding_generator = GloveWordEmbeddingGenerator(word_embedding_path, embedding_vector_dim)

In [None]:
# Pre-trained embedding weights for the words in word_to_index; build_model
# loads this matrix into the model's Embedding layer.
embedding_matrix = embedding_generator.generate(word_to_index)

In [None]:
# Number of rows in the embedding matrix (compare with vocabulary_size, which
# is the input dimension given to the Embedding layer).
len(embedding_matrix)

In [None]:
def build_model(vocabulary_size, embedding_vector_dim, embedding_matrix, optimizer, max_desc_len=None):
    """Build and compile the image-captioning network.

    The network has two inputs that are merged by element-wise addition:

    * "Image_Feature": a 2048-wide feature vector
      -> Dropout(0.5) -> Dense(512) -> LeakyReLU
    * "Description_Sequence": a sequence of word indices of length
      ``max_desc_len`` -> frozen Embedding -> Dropout(0.5) -> LSTM(512)

    The merged tensor goes through Dense(512) -> LeakyReLU and a softmax layer
    ("Words_Distribution") with one unit per vocabulary entry.

    Args:
        vocabulary_size: Input dimension of the Embedding layer and number of
            units of the softmax output.
        embedding_vector_dim: Output dimension of the Embedding layer.
        embedding_matrix: Pre-trained weights loaded into the Embedding layer.
            Keras only accepts them if their shape matches the layer's
            (vocabulary_size, embedding_vector_dim) weight matrix.
        optimizer: Keras optimizer used to compile the model.
        max_desc_len: Length of the sequence input. When omitted, falls back
            to ``dataset.max_desc_len()`` from the notebook's global
            ``dataset``, which is what this function always used before.

    Returns:
        A ModelWrapper around the compiled Keras model.
    """
    if max_desc_len is None:
        max_desc_len = dataset.max_desc_len()

    # Image branch.
    img_feat_input = Input(name="Image_Feature", shape=(2048,))

    img_branch = Dropout(0.5)(img_feat_input)
    img_branch = Dense(512)(img_branch)
    img_branch = LeakyReLU(alpha=0.3)(img_branch)

    # Text branch.
    seq_input = Input(name="Description_Sequence", shape=(max_desc_len,))

    # Keep a direct reference to the Embedding layer instead of looking it up
    # by its position in model.layers: that index changes as soon as a layer is
    # added to or removed from either branch.
    # NOTE(review): mask_zero=True reserves index 0 for padding — confirm that
    # word_to_index never maps a real word to 0 and that vocabulary_size
    # accounts for that extra slot.
    embedding_layer = Embedding(vocabulary_size, embedding_vector_dim, mask_zero=True)

    seq_branch = embedding_layer(seq_input)
    seq_branch = Dropout(0.5)(seq_branch)
    seq_branch = LSTM(512)(seq_branch)

    # Merge both branches and predict the distribution over the vocabulary.
    decoder = add([img_branch, seq_branch])
    decoder = Dense(512)(decoder)
    decoder = LeakyReLU(alpha=0.3)(decoder)

    outputs = Dense(name="Words_Distribution", units=vocabulary_size, activation='softmax')(decoder)

    model = Model(inputs=[img_feat_input, seq_input], outputs=outputs)

    # Load the pre-trained word vectors and freeze them. This has to happen
    # before compile() so the frozen state is taken into account.
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = False

    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=[rmse]
    )

    return ModelWrapper(model)

In [None]:
# Write a weights-only checkpoint each time val_loss improves. The file name
# comes from weights_file_path_patern, which embeds the epoch number, val_loss
# and val_rmse of the run.
checkpoint = ModelCheckpoint(
    weights_file_path_patern,
    monitor='val_loss', 
    verbose=1,
    save_best_only=True, 
    save_weights_only=True, 
    mode='auto',
    period=1
)
# Project callback (lib.callback.adam_learning_rate_tracker); passed to fit below.
lr_tracker = AdamLearningRateTracker()

In [None]:
# Training hyper-parameters.
# Alternative setting kept for reference: lr = 0.001 (same epochs/batch size).
epochs = 6
batch_size = 10
lr = 0.0001

In [None]:
# The training and validation generators differ only in the samples they read,
# so both are built from the same expression (training first, then validation).
train_generator, val_generator = (
    DataGenerator(
        split_samples,
        image_features,
        word_to_index,
        index_to_word,
        dataset.max_desc_len(),
        vocabulary_size,
        batch_size=batch_size
    )
    for split_samples in (train_samples, val_samples)
)

In [None]:
# Build and compile the captioning model with an Adam optimizer that uses the
# learning rate configured above.
model = build_model(vocabulary_size, embedding_vector_dim, embedding_matrix, Adam(lr=lr))
# model.show()

In [None]:
# Resume from a checkpoint written by an earlier run when it is available.
# The file name encodes that run's epoch and metrics, so it does not exist on a
# fresh checkout; in that case the freshly built model is used as-is instead of
# aborting the notebook here.
pretrained_weights_path = f'{weights_path}/weights__epoch_05__loss_3.1049__acc_0.0223.h5'
if os.path.isfile(pretrained_weights_path):
    model.load(pretrained_weights_path)
else:
    print(f'Checkpoint not found, keeping freshly initialised weights: {pretrained_weights_path}')

In [None]:
# Keras documents steps_per_epoch as an integer, while a plain division yields
# a float (566.3 for the recorded split). Round up so the trailing, partially
# filled batch is still included.
# NOTE(review): this assumes each step consumes `batch_size` samples — confirm
# against DataGenerator.
steps_per_epoch = int(np.ceil(len(train_samples) / batch_size))

model.fit(
    train_generator,
    val_generator,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    callbacks=[
        checkpoint,
        lr_tracker,
        MetricsPlotter(val_generator, plot_interval=100, evaluate_interval=100, batch_size=batch_size)
    ]
)

In [None]:
# Caption one randomly chosen test image.
# '$' and '#' are the same prefix/postfix tokens that FlickrDataset was given
# when the descriptions were loaded.
search = GreedySearch(model, word_to_index, index_to_word, '$', '#', dataset.max_desc_len())

# Each sample is an (image path, descriptions) pair; only the path is needed.
# `random` is imported in the notebook's import cell.
image_path = random.choice(test_samples)[0]

show_sample(
    image_path,
    description=search.perform(image_features[image_path])
)