In [1]:
import os
import pickle
import random
import time
import numpy as np
from keras_preprocessing.image import load_img, img_to_array

from nltk.translate.bleu_score import corpus_bleu
from tensorflow.python.keras.applications.densenet import DenseNet121, preprocess_input
from tensorflow.python.keras.models import load_model
from tensorflow.python.client import device_lib
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add

curr_folder = "D:/YandexDisk/datasets/"
end_dir = "D:/datasets/flickr-images-12k"

path_features = "D:/features-densenet121"
path_vocab = curr_folder + "ru-12k-vocab.pkl"
path_sentences = curr_folder + "ru-12k-sentences-train.pkl"
path_tokenizer = curr_folder + "ru-12k-tokenizer-train.pkl"

path_train_dict = curr_folder + "captions-ru-12k-train.pkl"
path_val_dict = curr_folder + "captions-ru-12k-val.pkl"

def image_names_set(data):
    vals = set()

    for idx in data.index:
        vals.add(data.iat[idx, 0][:-4])

    return vals

def load_image_features(filename, data):
    all_features = pickle.load(open(filename, 'rb'))
    features = {k: all_features[k] for k in data}

    return features

def to_lines(data):
    all_vals = list()
    for key in data.keys():
        [all_vals.append(d) for d in data[key]]

    return all_vals

def create_tokenizer(data):
    lines = to_lines(data)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)

    return tokenizer

def find_max_words(data):
    lines = to_lines(data)
    return max(len(l.split()) for l in lines)

def create_sequences(tokenizer, max_words, captions_list, image_name):
    X_image, X_text, y_word = list(), list(), list()
    vocab_size = len(tokenizer.word_index) + 1

    for caption in captions_list:
        seq = tokenizer.texts_to_sequences([caption])[0]

        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_words)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

            X_image.append(image_name)
            X_text.append(in_seq)
            y_word.append(out_seq)

    return X_image, X_text, y_word

def data_generator(tokenizer, max_words, data, images, batch_size, random_seed):
    count = 0
    random.seed(random_seed)

    img_names = list(data.keys())
    assert batch_size <= len(img_names), 'batch size must be less than or equal to {}'.format(len(img_names))

    while True:
        input_img_batch, input_seq_batch, output_word_batch = list(), list(), list()

        if count >= len(img_names):
            count = 0
        start_i = count
        end_i = min(len(img_names), count + batch_size)

        for i in range(start_i, end_i):
            curr_img = img_names[i]
            image = images[curr_img][0]
            captions_list = data[curr_img]
            random.shuffle(captions_list)

            input_img, input_seq, output_word = create_sequences(tokenizer, max_words, captions_list, image)

            for j in range(len(input_img)):
                input_img_batch.append(input_img[j])
                input_seq_batch.append(input_seq[j])
                output_word_batch.append(output_word[j])

        count = count + batch_size
        yield [np.array(input_img_batch), np.array(input_seq_batch)], np.array(output_word_batch)

def build_model(vocab_size, max_words):
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    inputs2 = Input(shape=(max_words,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    de1 = add([fe2, se3])
    de2 = Dense(256, activation='relu')(de1)
    outputs = Dense(vocab_size, activation='softmax')(de2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

with open (path_train_dict, 'rb') as f:
    train_dict = pickle.load(f)
train_features = load_image_features(path_features, train_dict)
print('кол-во подписей .............. %d' % len(train_dict))

with open (path_sentences, 'rb') as f:
    sentences = pickle.load(f)
tokenizer = create_tokenizer(train_dict)
vocab_size = len(tokenizer.word_index) + 1
print('размер словаря ............... %d' % vocab_size)

max_words = find_max_words(train_dict)
print('длина предложения в словах ... %d' % max_words)

кол-во подписей .............. 8262
размер словаря ............... 21391
длина предложения в словах ... 22


In [3]:
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 1713692510423848579
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3129068339
locality {
  bus_id: 1
  links {
  }
}
incarnation: 4954210445840460104
physical_device_desc: "device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


# Признаки

In [14]:
def extract_features(directory):
    model = DenseNet121(weights="imagenet")
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    x = model.output
    x = Dense(4096, activation="relu")(x)
    x = Dense(4096, activation="relu")(x)
    model = Model(model.inputs, x)

    features = dict()
    for name in os.listdir(directory):
        filename = directory + '/' + name

        image = load_img(filename, target_size=(224, 224))
        image = img_to_array(image)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)

        feature = model.predict(image, verbose=0)
        image_id = name.split('.')[0]
        features[image_id] = feature

    return features

In [15]:
%%time
features = extract_features(end_dir)
print('выделенные признаки ... %d' % len(features))
pickle.dump(features, open(path_features, 'wb'))

выделенные признаки ... 11804
Wall time: 25min 30s


# Обучение

In [18]:
epochs = 20
batch_size = 16

model = build_model(vocab_size, max_words)
steps = len(train_dict)/batch_size
if len(train_dict) % batch_size != 0:
    steps = steps + 1

start_time = time.time()
for i in range(epochs):
    generator = data_generator(tokenizer, max_words, train_dict, train_features, batch_size, 42)
    model.fit(generator,
              epochs=1, steps_per_epoch=steps,
              verbose=1)
    model.save('model-densenet121-' + str(i) + '.h5')
time_difference = time.time() - start_time

minutes = time_difference/60
print('время обучения в минутах ..... %d' % minutes)

время обучения в минутах ..... 96


# Оценка модели

In [11]:
with open (path_val_dict, 'rb') as f:
    val_dict = pickle.load(f)
val_features = load_image_features(path_features, val_dict)
print('кол-во подписей .............. %d' % len(val_dict))

val_tokenizer = create_tokenizer(val_dict)
val_vocab_size = len(val_tokenizer.word_index) + 1
print('размер словаря ............... %d' % val_vocab_size)

val_max_words = find_max_words(val_dict)
print('длина предложения в словах ... %d' % val_max_words)

def map_int_to_word(integer, tokenizer):
    for word, idx in tokenizer.word_index.items():
        if idx == integer:
            return word

    return None

def generate_caption(model, tokenizer, image, max_words):
    in_text = 'startseq'

    for i in range(max_words):
        seq = tokenizer.texts_to_sequences([in_text])[0]
        seq = pad_sequences([seq], maxlen=max_words)

        y_hat = model.predict([image,seq], verbose=0)
        y_hat = np.argmax(y_hat)

        word = map_int_to_word(y_hat, tokenizer)
        if word is None:
            break

        in_text += ' ' + word

        if word == 'endseq':
            break

    return in_text

def evaluate_model(model, captions, images, tokenizer, max_words):
    actual, predicted = list(), list()

    for key, captions_list in captions.items():
        references = [c.split() for c in captions_list]
        y_hat = generate_caption(model, tokenizer, images[key], max_words)

        actual.append(references)
        predicted.append(y_hat.split())

    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [14]:
path_model = 'D:/models/model-0.h5'
model = load_model(path_model)
evaluate_model(model, val_dict, val_features, tokenizer, max_words)

BLEU-1: 0.361409
BLEU-2: 0.193161
BLEU-3: 0.137490
BLEU-4: 0.055897
