##### Notatki
Tutorial tutaj: https://towardsdatascience.com/image-captioning-with-keras-teaching-computers-to-describe-pictures-c88a46a311b8

Dane Flickr 8k:
https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip

Polskie podpisy: http://zil.ipipan.waw.pl/Scwad/AIDe?action=AttachFile&do=get&target=AIDe_ANNOTATED_DESCRIPTIONS.zip

In [38]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import string
import os
from PIL import Image
import glob
from pickle import dump, load
from time import time
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras import Input, layers
from keras import optimizers
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from statistics import mean
from unidecode import unidecode
import nltk
nltk.download('wordnet')
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\macie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [39]:
# Dataset variant to evaluate; this string is interpolated into the annotation,
# description, pickle and model-weight file names used throughout the notebook.
learn_mode = 'lemma_no_pl'

In [40]:
# load doc into memory
def load_doc(filename):
    """Return the entire contents of a text file as one string.

    Args:
        filename: Path of the file to read (opened in text mode with the
            platform default encoding).

    Returns:
        The file's text.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()

# Read the raw annotation file that belongs to the selected learn_mode
doc = load_doc(f'../../AIDe_ANNOTATED_DESCRIPTIONS/token_{learn_mode}.txt')

In [41]:
def load_descriptions(doc):
    """Group captions by image id.

    Every usable line of ``doc`` is split on whitespace: the first token is
    the image reference and the remaining tokens form the caption. The image
    id is the part of the first token before its first ``.`` (so the
    ``.jpg`` extension and anything after it is dropped).

    Args:
        doc: Full text of the annotation file, one caption per line.

    Returns:
        dict mapping image id -> list of caption strings, in file order.
    """
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # Skip very short lines, and whitespace-only lines, which produce no
        # tokens and would otherwise raise IndexError on tokens[0].
        if len(line) < 2 or not tokens:
            continue
        # Image id = first token without the file extension
        image_id = tokens[0].split('.')[0]
        # Re-join the caption tokens with single spaces and store them
        mapping.setdefault(image_id, []).append(' '.join(tokens[1:]))
    return mapping

# Parse the raw annotation text into {image id: [captions]} and report the number of images
descriptions = load_descriptions(doc)
print(learn_mode, 'loaded: %d ' % len(descriptions))

lemma_no_pl loaded: 1000 


In [42]:
# Normalise captions: lowercase, strip punctuation, drop non-alphabetic tokens
def clean_descriptions(descriptions):
    """Clean every caption of ``descriptions`` in place.

    Each caption is split on whitespace, lowercased, stripped of all
    ``string.punctuation`` characters, and tokens that are not purely
    alphabetic afterwards (e.g. ones containing digits, or emptied tokens)
    are dropped. The caption lists are mutated; nothing is returned.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    for captions in descriptions.values():
        for position, caption in enumerate(captions):
            normalised = (
                token.lower().translate(strip_punctuation)
                for token in caption.split()
            )
            captions[position] = ' '.join(
                token for token in normalised if token.isalpha()
            )

# Normalise all captions; clean_descriptions rewrites the lists inside `descriptions` in place
clean_descriptions(descriptions)

In [43]:
# Function for creating vocabulary of all words present in the loaded descriptions
def to_vocabulary(descriptions):
    """Collect the distinct words used in all captions.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        set of every whitespace-separated token found in any caption.
    """
    words = set()
    # Plain loops instead of a list comprehension executed only for its side effects
    for captions in descriptions.values():
        for caption in captions:
            words.update(caption.split())
    return words

# Distinct words across all cleaned captions, and their count
vocabulary = to_vocabulary(descriptions)
print(learn_mode, 'vocabulary Size: %d' % len(vocabulary))

lemma_no_pl vocabulary Size: 3237


In [44]:
# load a pre-defined list of photo identifiers
def load_set(filename):
    """Read a list file and return the set of photo identifiers it names.

    Each non-empty line contributes the text before its first ``.``
    (i.e. the file name without extension).

    Args:
        filename: Path of the list file, one image file name per line.

    Returns:
        set of identifier strings.
    """
    lines = load_doc(filename).split('\n')
    return {line.split('.')[0] for line in lines if len(line) >= 1}

train = []
test = []

# One pair of identifier sets (train / test) per fold, read from the five pre-generated split files
for i in range(5):
    train_ids = load_set(f'randomState/trainImages{i}.txt')
    test_ids = load_set(f'randomState/testImages{i}.txt')
    train.append(train_ids)
    test.append(test_ids)


In [45]:
# Directory that contains all the images (the trailing slash matters: its length
# is used to slice the file name off each globbed path in later cells)
images = '../../Flickr8k_Dataset/'
# Full paths of all .jpg files in that directory
img = glob.glob(images + '*.jpg')

In [46]:
train_images = []
train_img = []

for i in range(0, 5):
    # Read the training image file names of fold i into a set; the context
    # manager closes the list file instead of leaving the handle open
    with open(f'randomState/trainImages{i}.txt', 'r') as names_file:
        train_images.append(set(names_file.read().strip().split('\n')))

    # Full paths (from `img`) whose file name belongs to this fold's training set
    train_img.append([im for im in img if im[len(images):] in train_images[i]])

In [47]:
test_images = []
test_img = []

for i in range(0, 5):
    # Read the test image file names of fold i into a set; the context
    # manager closes the list file instead of leaving the handle open
    with open(f'randomState/testImages{i}.txt', 'r') as names_file:
        test_images.append(set(names_file.read().strip().split('\n')))

    # Full paths (from `img`) whose file name belongs to this fold's test set
    test_img.append([im for im in img if im[len(images):] in test_images[i]])

In [48]:
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset, isForTraining):
    """Load the captions of the images that belong to ``dataset``.

    Each line of the file is split on whitespace: the first token is the
    image id, the rest is the caption.

    Args:
        filename: Path of the cleaned descriptions file.
        dataset: Collection of image ids to keep (membership is tested
            with ``in``).
        isForTraining: When true, every caption is wrapped as
            ``'startseq <caption> endseq'``; otherwise it is stored as is.

    Returns:
        dict mapping image id -> list of caption strings, in file order.
    """
    descriptions = dict()
    for line in load_doc(filename).split('\n'):
        tokens = line.split()
        # Blank lines (e.g. a trailing newline) carry no image id; skip them
        # instead of failing with IndexError on tokens[0].
        if not tokens:
            continue
        image_id, image_desc = tokens[0], ' '.join(tokens[1:])
        # Ignore images outside the requested set
        if image_id not in dataset:
            continue
        if isForTraining:
            # Sequence markers used by the caption model
            image_desc = 'startseq ' + image_desc + ' endseq'
        descriptions.setdefault(image_id, []).append(image_desc)
    return descriptions

train_descriptions = []
test_descriptions = []
no_tokens_train_descriptions = []

# Per fold: training captions with start/end markers, test captions without,
# and the training captions once more without markers.
descriptions_path = f'../descriptions_{learn_mode}.txt'
for i in range(5):
    fold_train = load_clean_descriptions(descriptions_path, train[i], True)
    train_descriptions.append(fold_train)
    print(f'Descriptions: train {i} = %d' % len(fold_train))

    fold_test = load_clean_descriptions(descriptions_path, test[i], False)
    test_descriptions.append(fold_test)
    print(f'Descriptions: test {i} = %d' % len(fold_test))

    fold_train_plain = load_clean_descriptions(descriptions_path, train[i], False)
    no_tokens_train_descriptions.append(fold_train_plain)
    print(f'Descriptions: train no tokens {i} = %d' % len(fold_train_plain))

Descriptions: train 0 = 700
Descriptions: test 0 = 300
Descriptions: train no tokens 0 = 700
Descriptions: train 1 = 700
Descriptions: test 1 = 300
Descriptions: train no tokens 1 = 700
Descriptions: train 2 = 700
Descriptions: test 2 = 300
Descriptions: train no tokens 2 = 700
Descriptions: train 3 = 700
Descriptions: test 3 = 300
Descriptions: train no tokens 3 = 700
Descriptions: train 4 = 700
Descriptions: test 4 = 300
Descriptions: train no tokens 4 = 700


In [58]:
def preprocess(image_path):
    """Load an image file and prepare it as a one-element batch for InceptionV3.

    Args:
        image_path: Path of the image file.

    Returns:
        The array produced by ``preprocess_input`` for the image resized to
        299x299, with a leading batch axis added.
    """
    # InceptionV3 expects 299x299 inputs
    pil_image = image.load_img(image_path, target_size=(299, 299))
    # PIL image -> array, then prepend the batch axis
    batch = np.expand_dims(image.img_to_array(pil_image), axis=0)
    # Apply the Inception-specific input preprocessing
    return preprocess_input(batch)

In [59]:
# Load the InceptionV3 model, requesting the 'imagenet' weights
model = InceptionV3(weights='imagenet')

In [60]:
# Create a feature extractor: same input as InceptionV3, but the output is taken
# from its second-to-last layer, i.e. the final (output) layer is left out
model_new = Model(model.input, model.layers[-2].output)

In [61]:
# Function to encode a given image into a vector of size (2048, )
def encode(image):
    """Return the `model_new` feature vector of one image file.

    Args:
        image: Path of the image file (passed on to ``preprocess``).

    Returns:
        1-D array: the prediction for the single image with its batch
        axis removed.
    """
    features = model_new.predict(preprocess(image))
    # (1, N) -> (N,)
    return np.reshape(features, features.shape[1])

In [62]:
train_features = []

# Load the pre-computed training image encodings of every fold.
# NOTE: pickle can execute arbitrary code on load; only use trusted files here.
for i in range(0,5):
    # The context manager closes the pickle file, which the bare open() did not
    with open(f'Pickle/encoded_train_images_{learn_mode}_{i}.pkl', 'rb') as encoded_pickle:
        train_features.append(load(encoded_pickle))
    print(f'Photos: train {i} = %d' % len(train_features[i]))

Photos: train 0 = 700
Photos: train 1 = 700
Photos: train 2 = 700
Photos: train 3 = 700
Photos: train 4 = 700


In [63]:
# Flatten the training captions of every fold into one list per fold
all_train_captions = []

for i in range(5):
    fold_captions = [
        cap
        for caps in train_descriptions[i].values()
        for cap in caps
    ]
    all_train_captions.append(fold_captions)
    print(len(fold_captions))

1400
1400
1400
1400
1400


In [64]:
# Keep only words that occur at least `word_count_threshold` times in a fold's training captions
word_count_threshold = 10
word_counts = []
vocab = []

for i in range(5):
    counts = {}
    nsents = len(all_train_captions[i])
    for sent in all_train_captions[i]:
        for w in sent.split(' '):
            counts[w] = counts.get(w, 0) + 1
    word_counts.append(counts)

    # Frequent words only, in first-seen order
    vocab.append([w for w, n in counts.items() if n >= word_count_threshold])
    print('preprocessed words %d -> %d' % (len(counts), len(vocab[i])))

preprocessed words 2115 -> 236
preprocessed words 2100 -> 244
preprocessed words 2077 -> 245
preprocessed words 2075 -> 245
preprocessed words 2054 -> 244


In [65]:
ixtoword = []
wordtoix = []

# Per fold: number the vocabulary words starting at 1 and keep both directions
# of the mapping (word -> index and index -> word).
for i in range(5):
    wordtoix.append({w: ix for ix, w in enumerate(vocab[i], start=1)})
    ixtoword.append(dict(enumerate(vocab[i], start=1)))

In [66]:
# Vocabulary size per fold; the extra 1 is for the appended 0's (padding index)
vocab_size = [len(ixtoword[i]) + 1 for i in range(5)]

vocab_size

[237, 245, 246, 246, 245]

In [67]:
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    """Flatten a descriptions dict into one list of captions.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        list with every caption, in dict iteration order.
    """
    all_desc = []
    # extend() instead of a list comprehension executed only for its side effects
    for captions in descriptions.values():
        all_desc.extend(captions)
    return all_desc

# calculate the length of the description with the most words
def get_max_length(descriptions):
    """Return the word count of the longest caption in ``descriptions``.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        Largest number of whitespace-separated tokens in any caption.

    Raises:
        ValueError: If there are no captions at all.
    """
    lengths = [len(caption.split()) for caption in to_lines(descriptions)]
    return max(lengths)

# Longest training caption (in words) of each fold
max_length = []

for i in range(5):
    fold_max = get_max_length(train_descriptions[i])
    max_length.append(fold_max)
    print('Description Length: %d' % fold_max)

Description Length: 31
Description Length: 31
Description Length: 36
Description Length: 36
Description Length: 36


Embeddingi pobrane stąd: https://github.com/sdadas/polish-nlp-resources/releases/download/v1.0/glove.zip

In [69]:
# Load Glove vectors
glove_dir = '../../glove'
embeddings_index = []

for i in range(0,5):
    embeddings_index.append({}) # one (initially empty) word -> vector dictionary per fold

# NOTE(review): the unidecode transliteration only triggers when learn_mode is
# exactly 'no_pl'; other modes (including names that merely end in 'no_pl') keep
# the original spelling. Confirm this is intended.
# The context manager closes the embeddings file, which was previously left open.
with open(os.path.join(glove_dir, 'glove_100_3_polish.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # First token is the word, the remaining tokens are its coefficients
        coefs = np.asarray(values[1:], dtype='float32')
        # The key does not depend on the fold, so compute it once per line
        word = values[0] if learn_mode != 'no_pl' else unidecode(values[0])
        for i in range(0,5):
            embeddings_index[i][word] = coefs


In [71]:
embedding_dim = 100
embedding_matrix = []

# Per fold: one row of `embedding_dim` values for every word index of the vocabulary
for i in range(5):
    fold_matrix = np.zeros((vocab_size[i], embedding_dim))

    for word, j in wordtoix[i].items():
        vector = embeddings_index[i].get(word)
        # Words without a pre-trained vector keep their all-zero row
        if vector is not None:
            fold_matrix[j] = vector

    embedding_matrix.append(fold_matrix)

In [72]:
# From here on `model` is a list with one caption model per fold; it replaces the
# InceptionV3 instance previously bound to this name (model_new keeps its own
# reference to that network's layers).
model = []

for i in range(0,5):
    # Image branch: 2048-value feature vector -> dropout -> 256-unit dense layer
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    # Text branch: word-index sequence of length max_length[i] -> embedding
    # (index 0 is masked) -> dropout -> 256-unit LSTM
    inputs2 = Input(shape=(max_length[i],))
    se1 = Embedding(vocab_size[i], embedding_dim, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    # Merge both branches by element-wise addition, then predict a softmax
    # distribution over the fold's vocabulary
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size[i], activation='softmax')(decoder2)
    model.append(Model(inputs=[inputs1, inputs2], outputs=outputs))

In [73]:
# Initialise each fold's embedding layer with the pre-trained vectors and freeze it
for i in range(0,5):
    # Look the layer up by type instead of relying on its position in model.layers
    embedding_layer = next(
        layer for layer in model[i].layers if isinstance(layer, Embedding)
    )
    embedding_layer.set_weights([embedding_matrix[i]])
    embedding_layer.trainable = False

In [74]:
# Compile every fold's model with categorical cross-entropy loss and the Adam optimizer
for i in range(0,5):
    model[i].compile(loss='categorical_crossentropy', optimizer='adam')

In [75]:
# Restore the saved weights of every fold's model for the selected learn_mode
# (the '30' in the file name is presumably the training epoch — TODO confirm)
for i in range(0,5):
    model[i].load_weights(f'model_weights/model_30_{learn_mode}_{i}.h5')

In [77]:
# Image directory; same value that `images` was given earlier in the notebook
images = '../../Flickr8k_Dataset/'

In [80]:
encoding_train = []
encoding_test = []

# Load the pickled image features of every fold (train and test).
# NOTE: pickle can execute arbitrary code on load; only use trusted files here.
for i in range(5):
    train_pickle_path = f'Pickle/encoded_train_images_{learn_mode}_{i}.pkl'
    test_pickle_path = f'Pickle/encoded_test_images_{learn_mode}_{i}.pkl'

    with open(train_pickle_path, 'rb') as encoded_pickle:
        encoding_train.append(load(encoded_pickle))
    with open(test_pickle_path, 'rb') as encoded_pickle:
        encoding_test.append(load(encoded_pickle))

In [88]:
# Greedy caption generation: at every step the single most probable next word
# is chosen, given the image features and the words generated so far.
def greedy_search(photo, index):
    """Generate a caption for one image with the model of fold ``index``.

    Starting from ``'startseq'``, the words generated so far are mapped to
    indices (words missing from the fold's vocabulary are skipped), padded to
    ``max_length[index]`` and fed to ``model[index]`` together with ``photo``.
    The arg-max of the prediction is appended as the next word. Generation
    stops at ``'endseq'``, after ``max_length[index]`` steps, or when the
    predicted index has no word (e.g. the padding index 0).

    Args:
        photo: Image feature array passed to the model as its first input.
        index: Fold number selecting the model and vocabulary.

    Returns:
        The generated words joined by spaces, without the start/end markers.
    """
    vocabulary = wordtoix[index]
    words = ['startseq']
    for _ in range(max_length[index]):
        sequence = [vocabulary[w] for w in words if w in vocabulary]
        sequence = pad_sequences([sequence], maxlen=max_length[index])
        yhat = model[index].predict([photo, sequence], verbose=0)
        # .get() avoids a KeyError when the arg-max is an index without a word
        word = ixtoword[index].get(int(np.argmax(yhat)))
        if word is None or word == 'endseq':
            break
        words.append(word)
    # Only the start marker is removed here; the end marker is never appended,
    # so a caption that hits the length limit keeps its last real word.
    return ' '.join(words[1:])

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [83]:
#sources:
#https://www.researchgate.net/publication/268689555_CIDEr_Consensus-based_Image_Description_Evaluation
#https://www.sciencedirect.com/topics/computer-science/cosine-similarity
#https://www.freecodecamp.org/news/how-to-process-textual-data-using-tf-idf-in-python-cd2bbc0a94a3/

#function for calculating cider metric
def cider(given_captions, prediction):
    """Mean TF-IDF cosine similarity between a prediction and its references.

    A ``TfidfVectorizer`` is fitted on the reference captions only; the
    prediction is transformed with that fitted vocabulary and compared with
    each reference by cosine similarity.

    Args:
        given_captions: Iterable of reference caption strings.
        prediction: Generated caption string.

    Returns:
        Array of shape (1, 1) holding the similarity averaged over the
        references (callers read the value with ``[0][0]``).
    """
    tfidf = TfidfVectorizer().fit(given_captions)
    reference_vectors = tfidf.transform(given_captions).toarray()
    candidate_vector = tfidf.transform([prediction]).toarray()

    similarity_sum = 0
    for reference in reference_vectors:
        # One (1, 1) similarity matrix per reference, accumulated in order
        similarity_sum = similarity_sum + cosine_similarity(candidate_vector, [reference])

    return similarity_sum / len(reference_vectors)

In [37]:
# Bag of words = bow

def get_bow(description):
    """Return the bag of words of a description.

    Args:
        description: Caption string.

    Returns:
        set of the pieces obtained by splitting on single spaces (an empty
        string therefore yields ``{''}``).
    """
    return set(description.split(' '))

#calculate bow score
def bow_score(bow_train, bow_pred):
    """Score how much of the reference bag of words the prediction covers.

    Args:
        bow_train: set of reference words.
        bow_pred: set of predicted words.

    Returns:
        Tuple ``(coverage, n_common)``: the percentage of reference words
        that also occur in the prediction, and the number of shared words.
        An empty reference set gives ``(0.0, 0)``.
    """
    overlap = bow_train & bow_pred
    # Guard the division; the previously computed (and unused) precision and
    # Jaccard variants were removed because they could raise
    # ZeroDivisionError for an empty prediction.
    if not bow_train:
        return 0.0, 0
    return float(len(overlap)) / len(bow_train) * 100, len(overlap)

In [90]:
import warnings; warnings.simplefilter('ignore')
from IPython.display import clear_output

#preparation of metrics lists (one inner list of per-image scores per fold)
bleu_train_scores = []
meteor_train_scores = []
cider_train_scores = []

bleu_test_scores = []
meteor_test_scores = []
cider_test_scores = []


def _evaluate_split(encodings, references, fold, label):
    """Score the generated captions for every image of one split.

    Args:
        encodings: dict mapping image file name -> feature array that can be
            reshaped to (1, 2048).
        references: dict mapping image id (file name up to its first '.')
            -> list of reference caption strings.
        fold: Fold number; selects the model used by ``greedy_search``.
        label: Text printed in the progress line ('train' or 'test').

    Returns:
        Tuple of three lists (BLEU, METEOR, CIDEr), one score per image.
    """
    bleu, meteor, cider_values = [], [], []
    for j, (pic, features) in enumerate(encodings.items()):
        # One consistent way to derive the description key from the file name
        refs = references[pic.split('.')[0]]
        # Local name on purpose: a global called `image` would overwrite the
        # imported keras `image` module used by preprocess().
        generated = greedy_search(features.reshape((1, 2048)), fold)

        # BLEU against all available references instead of exactly the first two
        bleu.append(sentence_bleu([ref.split() for ref in refs], generated.split()))
        # NOTE(review): newer NLTK releases expect pre-tokenized references and
        # hypothesis for meteor_score — confirm against the installed version.
        meteor.append(meteor_score(refs, generated))
        cider_values.append(cider(refs, generated)[0][0])

        clear_output(wait=True)
        print(label, fold, j, flush=True)
    return bleu, meteor, cider_values


for i in range(0,5):
    #calculation of metrics for train images dataset
    bleu, meteor, cider_values = _evaluate_split(
        encoding_train[i], no_tokens_train_descriptions[i], i, 'train')
    bleu_train_scores.append(bleu)
    meteor_train_scores.append(meteor)
    cider_train_scores.append(cider_values)

    #calculation of metrics for test images dataset
    bleu, meteor, cider_values = _evaluate_split(
        encoding_test[i], test_descriptions[i], i, 'test')
    bleu_test_scores.append(bleu)
    meteor_test_scores.append(meteor)
    cider_test_scores.append(cider_values)


#calculate mean score based on scores list for each metric
metric_lists = (
    bleu_train_scores,
    bleu_test_scores,
    meteor_train_scores,
    meteor_test_scores,
    cider_train_scores,
    cider_test_scores,
)

# One row per fold, one column per metric/split combination
result_data = {
    str(i): [mean(scores[i]) for scores in metric_lists]
    for i in range(5)
}

result = pd.DataFrame.from_dict(
    result_data,
    orient='index',
    columns=['Bleu TREN', 'Bleu TEST', 'Meteor TREN', 'Meteor TEST', 'CIDEr TREN', 'CIDEr TEST']
)

result

test 4 299


Unnamed: 0,Bleu TREN,Bleu TEST,Meteor TREN,Meteor TEST,CIDEr TREN,CIDEr TEST
0,0.043925,0.008942,0.190126,0.12849,0.28182,0.227051
1,0.024209,0.005275,0.172188,0.110724,0.279981,0.226442
2,0.033593,0.00539,0.203894,0.149129,0.304074,0.253831
3,0.023446,0.009099,0.16578,0.12272,0.256607,0.22214
4,0.040174,0.006343,0.204667,0.130822,0.304884,0.243957
