# EEE 443 - Final Project - Image Captioning:

## Group 10:

Ayhan Okuyan, Baris Akcin, Emre Donmez, Hasan Emre Erdemoglu, Ruzgar Eserol, Suleyman Taylan Topaloglu

### RNN Decoder & Image Captioning Notebook: (Part 2 of 2)

Note that in this section GPU will be utilized as training will be a cumbersome operation for the CPU. Some code is written to identify my GPU. 

1. Validate which directories that you work at, then import given dataset. The data extracted from the first notebook will be unpickled here. For each of the transfer learning encoder models (CNN models) that is extracted by Notebook 1, an RNN network can be tied to. 

    
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [3]:
import tensorflow as tf # tensorflow gpu capabilities available
from tensorflow.python.client import device_lib

print("Number of GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print('\n')
print(device_lib.list_local_devices())

Number of GPUs Available:  1


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5586630614588554977
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 4937233203
locality {
  bus_id: 1
  links {
  }
}
incarnation: 8443299653674864976
physical_device_desc: "device: 0, name: GeForce GTX 1060 with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


In [4]:
import os

root_dir = os.getcwd()
exp_dir = root_dir

# Fetch and display the directory that we are working on:
print(os.getcwd())
print(os.listdir())

C:\Users\ayhok\Desktop\EEE443 Project
['.ipynb_checkpoints', 'Attention.ipynb', 'eee443_project_dataset_train.h5', 'exports', 'Image Downloader & Pickler.ipynb', 'img.zip', 'img_encodings.pkl', 'img_encodings_indices.pkl', 'RNN Decoder & Image Captioning Notebook_v2.ipynb', 'word_dict.pkl']


### Dataset Extraction & Unpickling:

In [6]:
import h5py
import pickle
import numpy as np

def eee443_dataset_read(path):
    f = h5py.File(path + '\\eee443_project_dataset_train.h5', 'r')
    train_cap = f['train_cap']
    train_imid = f['train_imid']
    train_url = f['train_url']
    word_code = f['word_code']
    train_ims = None
    return train_imid, train_cap, train_url, word_code, f

def extract_word_dict(dataset):
    words_struct = dataset.get('word_code')[()]
    word_list = list(words_struct.dtype.names) # returns the key
    w_all_idx = []
    for word in word_list:
        w_idx = int(words_struct[word])
        w_all_idx.append(w_idx)

    #print(w_all_idx)
    
    word_dict = dict(zip(w_all_idx, word_list))
    #print(word_dict[627]) # shop
    return word_dict

def unpickle_data(path, filename):
    file = pickle.load(open(path + filename,'rb'), encoding='utf8')
    return file

#### Dataset Reading & PKL File Extraction for CNN Encoders:

In [5]:
# Dataset reading: - Export word_dict too.
_, _, _, _, f = eee443_dataset_read(root_dir) # from previous notebook.
word_dict = extract_word_dict(f)

# Print if everything is alright or not.
print('Size of words dictionary: ', len(word_dict.keys()))
#print(word_dict)

Size of words dictionary:  1004


In [9]:
# Inception Data Unpickling:
enc_inception = unpickle_data(exp_dir, '\\img_encodings.pkl')
enc_inception_idx = unpickle_data(exp_dir, '\\img_encodings_indices.pkl')

# Verify sizes and all sorts of stuff:
print(type(enc_inception))
print(np.shape(enc_inception))
enc_inception = np.squeeze(enc_inception)
print('Encoded images final shape: ', np.shape(enc_inception))
print('\n')

print(type(enc_inception_idx))
print(np.shape(enc_inception_idx))
enc_inception_idx = np.squeeze(enc_inception_idx)
print('Encoded image indices final shape: ', np.shape(enc_inception_idx))
print(len(enc_inception_idx[300:360]))
print('\n')

enc_inception_dict = dict(zip(enc_inception_idx, enc_inception))
print('Inception encoding dictionary with query 10: ', enc_inception_dict['10']) # 10 is the 10.jpg here. 
print('Size of inception encoding dictionary with query 10: ', len(enc_inception_dict['10'])) # 10 is the 10.jpg here. 
print(len(enc_inception_dict))

<class 'list'>
(73724, 1, 2048)
Encoded images final shape:  (73724, 2048)


<class 'list'>
(73724,)
Encoded image indices final shape:  (73724,)
60


Inception encoding dictionary with query 10:  [0.0498227  0.10677398 0.05705195 ... 0.75969905 0.36796963 0.        ]
Size of inception encoding dictionary with query 10:  2048
73724


In [None]:
# VGG16 Data Unpickling: (to be implemented)

In [None]:
# Inception ResNet V2 Data Unpickling: (to be implemented)

### Caption Helper Functions:

These functions are used to extract numerical and textual captions from the given dataset. These will be useful in the later stages of the code.

In [10]:
import numpy as np

# Check the text data for images:
# Extracting data from dataset:
def fetch_captions(image_id, dataset, word_dict):
    # Query this on train_imid to extract which indices hold the captions for this image:
    imid = np.array(dataset.get('train_imid')[()])
    indices = np.where(imid == int(image_id))[0] # since everything is string, must be cast to int manually.
                                                 # indices is a tuple of array 

    # Extract the list of integer captions for the given image  
    all_caps = np.array(dataset.get('train_cap')[()])
    #print('Overall shape of the captions: ', all_caps.shape) 
    
    count = 1
    caps = []
    for idx in indices:
        cap = all_caps[idx][:]
        #print('Caption ', str(count), ': ', str(cap))
        caps.append(cap)
        count += 1
    
    #print('')
    #print('Captions: ', str(caps))  # Final look at the captions
    
    # Now do conversion:
    text_cap = []
    count = 0
    for item in caps:
        temp = []
        count += 1
        for word in item:
            #print(item)
            #print(str(word)  , (word_dict[word]))
            temp.append(word_dict[word])
        text_cap.append(temp)
        #print('Caption ', count, ' textual: ', (' '.join(map(str, temp))).split('x_NULL_')[0]) # list comprehension
        #temp.remove()
    
    #print(type(text_cap))
    # Return captions     
    caps = [list(c) for c in caps] # list comprehension to make arrays list
    return indices, text_cap, caps

# Helper function to print captions of an image:
def print_captions(caps, word_dict):
    # Only for displaying
    #print('Indices of captions for this image:', test_id)
    #print('\n')

    i = 0
    text_cap = []
    for item in caps:
        i += 1
        temp = []
        print('Caption ', i, ': ',  item)
        for word in item:
            #print(item)
            #print(str(word)  , (word_dict[word]))
            temp.append(word_dict[word])
        text_cap.append(temp)
        # x_NULL_ strings are only ignored, not erased from the captions
        print('Caption ', i, ' textual: ', (' '.join(map(str, temp))).split('x_NULL_')[0])
        print('\n')
    return

In [11]:
indices, text_cap, caps = fetch_captions(10, f, word_dict)

# Validation using image file: '10.jpg'
# Printing the image itself or its encoded output doesn't matter.
print(indices)
#print_captions(caps,word_dict)

print('Type of captions: ', type(caps))
print('Length of captions list: ', len(caps))
print('Type of one of the tokenized captions: ', type(caps[0]))

for c in caps:
    print(c)
    print(type(c))

[12376 12425 12565 12689 12809]
Type of captions:  <class 'list'>
Length of captions list:  5
Type of one of the tokenized captions:  <class 'list'>
[1, 16, 19, 8, 4, 61, 125, 107, 72, 18, 15, 3, 2, 0, 0, 0, 0]
<class 'list'>
[1, 4, 12, 8, 4, 3, 10, 254, 3, 93, 4, 185, 2, 0, 0, 0, 0]
<class 'list'>
[1, 4, 12, 8, 4, 61, 562, 72, 32, 18, 4, 238, 6, 328, 2, 0, 0]
<class 'list'>
[1, 4, 12, 9, 4, 3, 10, 4, 60, 189, 11, 460, 30, 61, 2, 0, 0]
<class 'list'>
[1, 4, 28, 507, 143, 7, 185, 6, 4, 61, 125, 2, 0, 0, 0, 0, 0]
<class 'list'>


### Generate training, validation and test sets:

1. Split to 80 % training, 10 % validation and 10% test data.
2. Upon successful training and validation train with 90 % of data keeping remaining 10% test data spared.
3. Do as much as random picking for the image encodings.


In [12]:
import random
import math

def divide_into_two(percentage, list_to_divide):
    indices = random.sample(range(1, len(list_to_divide)), len(list_to_divide)-math.ceil((1-percentage)*len(list_to_divide)))
    samples = []
    indices.sort(reverse=True) # reverse to do not alter original indices when removed
    for idx in indices:
            samples.append(list_to_divide[idx])
            list_to_divide.remove(list_to_divide[idx])
    # return back samples to original format
    samples.reverse()
    return samples, list_to_divide

In [13]:
TRAIN_PERCENTAGE = 0.8 # 80% train data, rest is 20% which is to be divided 10%/10%
VAL_PERCENTAGE = 0.5 # to seperate out 50% of remaining data

key_list = list(enc_inception_dict.keys())

train, rest = divide_into_two(TRAIN_PERCENTAGE, key_list)
validation, test = divide_into_two(VAL_PERCENTAGE, rest)
print('Train size:', len(train))
print('Validation size:',len(validation))
print('Test size:',len(test))

# Do further tests on training validation and test indices:
print('Train sample:', train[400:410])
print('Validation sample:', validation[400:410])
print('Test sample:', test[400:410])

Train size: 58979
Validation size: 7372
Test size: 7373
Train sample: ['10526', '10528', '10529', '1053', '10530', '10531', '10533', '10534', '10536', '10537']
Validation sample: ['14468', '14483', '14487', '14493', '14498', '14505', '14518', '1455', '1456', '14635']
Test sample: ['14058', '14108', '14114', '1412', '14127', '14134', '14149', '14176', '14193', '14194']


### Prepare the input data:

Each key with train, validation and test contains keys for encoding dictionary. Each key in the encoding corresponds to a single encoded image, and each encoded image corresponds to approximately 5 captions.

1. For each key:
    1.1. Replicate the encoding number of caption times. The size should be (5*key_size, 2048).
    1.2. Store all captions in a singly 2D list. The size should be (5*key_size, 2048).
2. Both encoding list and caption list will be aligned and these will be fed to the RNN decoder consecutively.

(In further implementations, to benefit from the GPU, we will try to get these inputs by batches, so this will be the final preperation before feeding the input to the network.)

In [14]:
from tqdm import tqdm
def prepare_data(dataset, encoding_dict, data_key_list, word_dict):
    encoding_dataset = []
    caption_dataset = []
    for key in tqdm(data_key_list): # Iterate over each key item in train/validation/test
        _, _, key_captions = fetch_captions(key, dataset, word_dict)
        len_captions_set = len(key_captions)
        # Replicate encoding number of caption times.
        for i in range(0,len_captions_set):
            encoding_dataset.append(encoding_dict[key])
        
        # Process the captions:
        for c in key_captions:
            caption_dataset.append(c)
            
    return encoding_dataset,caption_dataset

In [15]:
encoding_dataset,caption_dataset = prepare_data(f, enc_inception_dict, test, word_dict)

#Tests
print(len(encoding_dataset))
print(len(caption_dataset))

100%|██████████████████████████████████████████████████████████████████████████████| 7373/7373 [10:32<00:00, 11.66it/s]

35681
35681





In [16]:
# This process takes a long time.
enc_trn,cap_trn = prepare_data(f, enc_inception_dict, train, word_dict)
enc_val,cap_val = prepare_data(f, enc_inception_dict, validation, word_dict)
enc_tst,cap_tst = prepare_data(f, enc_inception_dict, test, word_dict)

100%|████████████████████████████████████████████████████████████████████████████| 58979/58979 [42:13<00:00, 23.28it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 7372/7372 [03:26<00:00, 35.76it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 7373/7373 [03:26<00:00, 35.66it/s]


In [17]:
# Check sizes on datasets - proofread material:
#Tests
print(len(enc_trn),print(len(cap_trn)))
print(len(enc_val),print(len(cap_val)))
print(len(enc_tst),print(len(cap_tst)))

285041
285041 None
35684
35684 None
35681
35681 None


In [18]:
print(len(cap_trn[0]))

17


### Data preperation checkpoint:

If everything works well until here, one may use these cells to save the dataset and export them layer for later use.

In [20]:
from pickle import dump
# If everything works properly, this is a checkpoint 
inception_export = root_dir + '\\exports'

if not os.path.exists(inception_export):
    os.mkdir(inception_export)

os.chdir(inception_export)
dump(enc_trn, open('enc_trn.pkl', 'wb'))
dump(enc_val, open('enc_val.pkl', 'wb'))
dump(enc_tst, open('enc_tst.pkl', 'wb'))
dump(cap_trn, open('cap_trn.pkl', 'wb'))
dump(cap_val, open('cap_val.pkl', 'wb'))
dump(cap_tst, open('cap_tst.pkl', 'wb'))
os.chdir(root_dir)

In [8]:
#Load data again:
root_dir = os.getcwd()
inception_export = root_dir + '\\exports\\'
enc_trn = unpickle_data(inception_export, 'enc_trn.pkl')
enc_val = unpickle_data(inception_export, 'enc_val.pkl')
enc_tst = unpickle_data(inception_export, 'enc_tst.pkl')
cap_trn = unpickle_data(inception_export, 'cap_trn.pkl')
cap_val = unpickle_data(inception_export, 'cap_val.pkl')
cap_tst = unpickle_data(inception_export, 'cap_tst.pkl')

In [9]:
# Check sizes on datasets - proofread material:
#Tests
print('enc_trn', len(enc_trn))
print('cap_trn', len(cap_trn))

enc_trn = np.array(enc_trn)
cap_trn = np.array(cap_trn)

print(enc_trn.shape)
print(cap_trn.shape)

enc_val = np.array(enc_val)
cap_val = np.array(cap_val)

enc_tst = np.array(enc_tst)
cap_tst = np.array(cap_tst)


enc_trn 285041
cap_trn 285041
(285041, 2048)
(285041, 17)


In [10]:
print(enc_trn[0])
print(cap_trn[0])

[0.0498227  0.10677398 0.05705195 ... 0.75969905 0.36796963 0.        ]
[  1  16  19   8   4  61 125 107  72  18  15   3   2   0   0   0   0]


In [11]:
# Form output labels:
def generate_labels(caps_set):
    cap_lbl = []
    for i in range(0, len(caps_set)):
        temp = caps_set[i][1:]
        temp = np.append(temp, 0)
        cap_lbl.append(temp)
    cap_lbl = np.array(cap_lbl)
    return cap_lbl

In [12]:
# Form label by shifting 1 unit. -testing:
cap_trn_lbl = generate_labels(cap_trn)
cap_val_lbl = generate_labels(cap_val)
cap_tst_lbl = generate_labels(cap_tst)

print(cap_trn.shape)
print(cap_trn_lbl.shape)


print(cap_trn[0])
print(cap_trn_lbl[0])


(285041, 17)
(285041, 17)
[  1  16  19   8   4  61 125 107  72  18  15   3   2   0   0   0   0]
[ 16  19   8   4  61 125 107  72  18  15   3   2   0   0   0   0   0]


# TODO AFTER HERE

# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
	X1, X2, y = list(), list(), list()
	# walk through each image identifier
	for key, desc_list in descriptions.items():
		# walk through each description for the image
		for desc in desc_list:
			# encode the sequence
			seq = tokenizer.texts_to_sequences([desc])[0]
			# split one sequence into multiple X,y pairs
			for i in range(1, len(seq)):
				# split into input and output pair
				in_seq, out_seq = seq[:i], seq[i]
				# pad input sequence
				in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
				# encode output sequence
				out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
				# store
				X1.append(photos[key][0])
				X2.append(in_seq)
				y.append(out_seq)
	return array(X1), array(X2), array(y)

### RNN Model Development:

In [13]:
def construct_batch(data_enc, data_cap, batch_start_index, batch_size):
    enc_batch = data_enc[batch_start_index:batch_start_index+batch_size]
    
    cap_batch = []
    for i in range(batch_start_index,batch_start_index+batch_size):
        cap_batch.append(data_cap[i])
    return enc_batch, cap_batch

In [14]:
enc_batch, cap_batch = construct_batch(enc_trn, cap_trn, 0, 512)

print(len(enc_batch))
print(len(cap_batch))
print(len(cap_batch[0]))

print(enc_batch[1])
print(cap_batch[1])

512
512
17
[0.0498227  0.10677398 0.05705195 ... 0.75969905 0.36796963 0.        ]
[  1   4  12   8   4   3  10 254   3  93   4 185   2   0   0   0   0]


In [15]:
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Embedding, GRU, add
from tensorflow.compat.v1.keras.layers import CuDNNLSTM

def lstm_block(enc_size, vocab_size, max_length, unit_size, drop_rate): # max_length: max length of captions
    # feature extractor model - Already calculated reduce size.
    tin = Input(shape=(enc_size,))
    f1 = Dropout(drop_rate)(tin)
    f2 = Dense(unit_size, activation='relu')(f1)
    
    # sequence model - this part is for the captions:
    cin = Input(shape=(max_length,))
    s1 = Embedding(vocab_size, unit_size, mask_zero=True)(cin)
    s2 = Dropout(drop_rate)(s1)
    s3 = LSTM(unit_size,return_sequences=True)(s2)
    s4 = LSTM(unit_size,return_sequences=True)(s3)
    s5 = LSTM(unit_size)(s4)
    # decoder model
    decoder1 = add([f2, s5])
    decoder2 = Dense(unit_size, activation='relu')(decoder1)
    outputs = Dense(max_length, activation='softmax')(decoder2)
    
    # tie it together [image, seq] [word]
    model = tf.keras.Model(inputs=[tin,cin], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='Adam',metrics=[tf.keras.metrics.Accuracy()])
    
    # summarize model
    print(model.summary())
    return model

In [16]:
tf.keras.backend.clear_session()  # For easy reset of notebook state.
model = lstm_block(2048, 1004, 17, 256, 0.5)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 17)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 17, 256)      257024      input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 17, 256)      0           embedding[0][0]                  
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 2048)]       0                                            
______________________________________________________________________________________________

In [17]:
print('enc_trn         ', enc_trn.shape)
print('cap_trn         ', cap_trn.shape)
print('cap_trn_lbl     ', cap_trn_lbl.shape)
print('enc_val         ', enc_val.shape)
print('cap_val         ', cap_val.shape)
print('cap_val_lbl     ', cap_val_lbl.shape)

enc_trn          (285041, 2048)
cap_trn          (285041, 17)
cap_trn_lbl      (285041, 17)
enc_val          (35684, 2048)
cap_val          (35684, 17)
cap_val_lbl      (35684, 17)


In [18]:
model.fit([enc_trn[0:30000][:], cap_trn[0:30000][:]], cap_trn_lbl[0:30000][:], epochs=20, verbose=1, validation_data=([enc_val, cap_val], cap_val_lbl))

Train on 30000 samples, validate on 35684 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20


KeyboardInterrupt: 