In [2]:
import os
import shutil
import pickle
import string
import pandas as pd

from tensorflow.python.client import device_lib
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model

In [3]:
curr_folder = "D:/YandexDisk/datasets/"

start_dir = "D:/datasets/flickr-images-30k"
end_dir = "D:/datasets/flickr-images-12k"

path_captions = curr_folder + "captions-ru-12k.csv"
path_captions_no_puncts = curr_folder + "captions-ru-12k-no-puncts.csv"

path_train = curr_folder + "captions-ru-12k-train.csv"
path_val = curr_folder + "captions-ru-12k-val.csv"
path_test = curr_folder + "captions-ru-12k-test.csv"

path_features = curr_folder + "ru-12k-features.pkl"
path_vocab = curr_folder + "ru-12k-vocab.pkl"
path_sentences = curr_folder + "ru-12k-sentences-train.pkl"

path_train_dict = curr_folder + "captions-ru-12k-train.pkl"
path_val_dict = curr_folder + "captions-ru-12k-val.pkl"

# Удалить длинные предложения

In [18]:
df = pd.read_csv(path_captions, sep='|')

In [21]:
idxs_to_del = []

for idx in df.index:
    curr_len = len(df.iat[idx, 2])
    if curr_len > 100:
        idxs_to_del.append(idx)

df_to_delete = df.loc[idxs_to_del, :]
images_to_delete = df_to_delete["image_name"].unique()

In [26]:
print(df.shape)
df = df[~df.image_name.isin(images_to_delete)]
print(df.shape)

(77875, 3)
(59020, 3)


# Подготовка данных к обучению

– Каждое слово с маленькой буквы
– Удалить знаки препинания
– Только буквы

In [7]:
def clean_captions(data):
    table = str.maketrans('', '', string.punctuation)

    for idx in data.index:
        curr_capt = data.iat[idx, 2]
        curr_capt = curr_capt.split()
        curr_capt = [word.lower() for word in curr_capt]
        curr_capt = [word.translate(table) for word in curr_capt]
        curr_capt = [word for word in curr_capt if word.isalpha()]

        data.iat[idx, 2] = ' '.join(curr_capt)

In [7]:
df = pd.read_csv(path_captions, delimiter='|')
df.head(5)

Unnamed: 0,image_name,comment_number,comment
0,1000092795.jpg,0,два молодых парня с лохматыми волосами смотрят...
1,1000092795.jpg,1,два молодых белых самца находятся снаружи возл...
2,1000092795.jpg,2,двое мужчин в зеленых рубашках стоят во дворе
3,1000092795.jpg,3,мужчина в синей рубашке стоящий в саду
4,1000092795.jpg,4,два друга наслаждаются временем проведенным вм...


In [36]:
clean_captions(df)
df.head()

Unnamed: 0,image_name,comment_number,comment
0,1000092795.jpg,0,два молодых парня с лохматыми волосами смотрят...
1,1000092795.jpg,1,два молодых белых самца находятся снаружи возл...
2,1000092795.jpg,2,двое мужчин в зеленых рубашках стоят во дворе
3,1000092795.jpg,3,мужчина в синей рубашке стоящий в саду
4,1000092795.jpg,4,два друга наслаждаются временем проведенным вм...


# Словарь

In [39]:
def to_vocab(data):
    vocab = set()

    for idx in data.index:
        vocab.update(data.iat[idx, 2].split())

    return vocab

In [43]:
df = pd.read_csv(path_captions_no_puncts, sep='|')
vocab = to_vocab(df)
print('размер словаря ... %d' % len(vocab))

размер словаря ... 25271


In [46]:
with open(path_vocab,'wb') as f:
    pickle.dump(vocab, f)

In [47]:
with open(path_vocab,'rb') as f:
    loaded_vocab = pickle.load(f)

# Разбить набор на три части

In [67]:
df = pd.read_csv(path_captions_no_puncts, delimiter='|')
df.head(5)

Unnamed: 0,image_name,comment_number,comment
0,1000092795.jpg,0,два молодых парня с лохматыми волосами смотрят...
1,1000092795.jpg,1,два молодых белых самца находятся снаружи возл...
2,1000092795.jpg,2,двое мужчин в зеленых рубашках стоят во дворе
3,1000092795.jpg,3,мужчина в синей рубашке стоящий в саду
4,1000092795.jpg,4,два друга наслаждаются временем проведенным вм...


In [68]:
n = len(df)

train_df = df[0:int(n*0.7)]
val_df = df[int(n*0.7):int(n*0.9)]
test_df = df[int(n*0.9):]

train_df.to_csv(path_train, sep='|', encoding='utf-8', index=False)
val_df.to_csv(path_val, sep='|', encoding='utf-8', index=False)
test_df.to_csv(path_test, sep='|', encoding='utf-8', index=False)

# Добавить начальные и конечные строки startseq и endseq

In [69]:
def add_start_end_tags(data):
    for idx in data.index:
        curr_str = data.iat[idx, 2]
        data.iat[idx, 2] = 'startseq ' + curr_str + ' endseq'

In [71]:
train_df = pd.read_csv(path_train, sep='|')
val_df = pd.read_csv(path_val, sep='|')
test_df = pd.read_csv(path_test, sep='|')

add_start_end_tags(train_df)
add_start_end_tags(val_df)
add_start_end_tags(test_df)

train_df.to_csv(path_train, sep='|', encoding='utf-8', index=False)
val_df.to_csv(path_val, sep='|', encoding='utf-8', index=False)
test_df.to_csv(path_test, sep='|', encoding='utf-8', index=False)

In [72]:
test_df.iat[0, 2]

'startseq группа мужчин идущих по льду с тем что выглядит как антарктида или один из полюсов endseq'

# Конвертация pandas.dataframe в dict

In [4]:
def to_dict(data):
    out_dict = dict()

    start_index = 0
    end_index = len(data) - 1 - 5

    for idx in range(start_index, end_index+1, 5):
        image_name = data.iat[idx, 0][:-4]
        curr_list = list()

        curr_list.append(data.iat[idx, 2])
        curr_list.append(data.iat[idx+1, 2])
        curr_list.append(data.iat[idx+2, 2])
        curr_list.append(data.iat[idx+3, 2])
        curr_list.append(data.iat[idx+4, 2])

        out_dict[image_name] = curr_list

    return out_dict

In [3]:
df_to_convert = pd.read_csv(path_val, sep='|')
df_to_convert.head()

In [8]:
new_dict = to_dict(df_to_convert)

In [9]:
with open(path_val_dict,'wb') as f:
    pickle.dump(new_dict, f)

In [13]:
with open (path_val_dict, 'rb') as f:
    test_dict = pickle.load(f)

# Предложения для обучения

In [4]:
def to_sentences(data):
    all_sentences = list()

    for idx in data.index:
        all_sentences.append(data.iat[idx, 2])

    return all_sentences

In [5]:
train_df = pd.read_csv(path_train, delimiter='|')
sentences = to_sentences(train_df)
print(len(sentences))

41315


In [6]:
with open(path_sentences, 'wb') as f:
    pickle.dump(sentences, f)

In [14]:
with open (path_sentences, 'rb') as f:
    list_sentences = pickle.load(f)

# Выбрать и скопировать изображения

In [56]:
df = pd.read_csv(path_captions, sep='|')
images_unique = df["image_name"].unique()
print(images_unique.shape)
print(df.shape)

(11804,)
(59020, 3)


In [59]:
for image_name in images_unique:
    curr_image = start_dir + '/' + image_name
    copied_image = end_dir + '/' + image_name
    shutil.copy2(curr_image, copied_image )

counter = len(os.listdir(path=end_dir))
print("скопировано изображений ... " + str(counter))

скопировано изображений ... 11804


# Извлечение признаков

In [60]:
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 13655461543632212713
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3129068339
locality {
  bus_id: 1
  links {
  }
}
incarnation: 12884065117983840751
physical_device_desc: "device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


In [63]:
def extract_features(directory):
    model = VGG16()
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

    features = dict()
    for name in os.listdir(directory):
        filename = directory + '/' + name

        image = load_img(filename, target_size=(224, 224))
        image = img_to_array(image)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)

        feature = model.predict(image, verbose=0)
        image_id = name.split('.')[0]
        features[image_id] = feature

    return features

In [64]:
%%time
features = extract_features(end_dir)
print('выделенные признаки ... %d' % len(features))
pickle.dump(features, open(path_features, 'wb'))

выделенные признаки ... 11804
Wall time: 18min 47s
