In [14]:
import string
import numpy as np
from PIL import Image
import os
from pickle import dump,load
import tensorflow as tf
from tensorflow.keras.applications.xception import Xception,preprocess_input
from tensorflow.keras.preprocessing.image import load_img,img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import add
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.layers import Input,Dense,LSTM,Embedding,Dropout
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [15]:
def load_doc(filename):
    file=open(filename,'r')
    text=file.read()
    file.close()
    return text

In [16]:
def get_captions(filename):
    file=load_doc(filename)
    captions=file.split('\n')
    descriptions={}
    for caption in captions[:-1]:
        img,caption=caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]]=[caption]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions        

In [17]:
def clean_text(captions):
    table=str.maketrans('','',string.punctuation)
    for img,caps in captions.items():
        for i,img_caption in enumerate(caps):
            img_caption.replace("-"," ")
            desc=img_caption.split()
            
            #lowering Cases 
            desc=[word.lower() for word in desc]
            
            #removing punctuations
            desc=[word.translate(table) for word in desc]
            
            #removing hanging 's and a'
            desc=[word for word in desc if(len(word)>1)]
            
            #removing tokens with number in them
            desc=[word for word in desc if(word.isalpha())]
            
            #converting back to string 
            img_caption=' '.join(desc)
            captions[img][i]=img_caption
    return captions

In [18]:
def text_vocabulary(descriptions):
    #build vocabulary in all unique words
    vocab=set()
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    return vocab    

In [19]:
def save_descriptions(descriptions,filename):
    lines=list()
    for key,desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key+'\t'+desc)
    data='\n'.join(lines)
    file=open(filename,"w")
    file.write(data)
    file.close()

In [29]:
dataset_text="/home/altaf/Documents/Image_Captioning_Using_RNNs/Flickr_Data/Flickr_TextData"
dataset_images="/home/altaf/Documents/Image_Captioning_Using_RNNs/Flickr_Data/Images"

In [32]:
#preparing the text data
filename=dataset_text+"/"+"Flickr8k.token.txt"
#loading the file containing all datas
#mapping them into descriptions dictionary img to 5 captions
descriptions=get_captions(filename)
print("Length of descriptions=",len(descriptions))
#cleaning description
clean_descriptions=clean_text(descriptions)
#building Vocabulary
vocabulary=text_vocabulary(clean_descriptions)
print("Length of Vocabulary=",len(vocabulary))
#saving each description to file
save_descriptions(clean_descriptions,"descriptions.txt")


Length of descriptions= 8092
Length of Vocabulary= 8763


In [46]:
def extract_features(directory):
        model = Xception( include_top=False, pooling='avg' )
        features = {}
        for img in tqdm(os.listdir(directory)):
            filename = directory + "/" + img
            image = Image.open(filename)
            image = image.resize((299,299))
            image = np.expand_dims(image, axis=0)
            #image = preprocess_input(image)
            image = image/127.5
            image = image - 1.0
            
            feature = model.predict(image)
            features[img] = feature
        return features

In [47]:
#feature Vector
#features=extract_features(dataset_images)
#dump(features,open("features.p","wb"))

HBox(children=(IntProgress(value=0, max=8091), HTML(value='')))

In [48]:
features=load(open("features.p","rb"))

In [68]:
#loading data
def load_photos(filename):
    file=load_doc(filename)
    photos=file.split("\n")[:-1]
    return photos
def load_clean_descriptions(filename, photos):   
    #loading clean_descriptions
    file = load_doc(filename)
    descriptions = {}
    for line in file.split("\n"):
        words = line.split()
        if len(words)<1 :
            continue
        image, image_caption = words[0], words[1:]
        
        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = '<start> ' + " ".join(image_caption) + ' <end>'
            descriptions[image].append(desc)

    return descriptions
def load_features(photo):
    all_features=load(open("features.p","rb"))
    #selecting only needed features
    features={k:all_features[k] for k in photo}
    return features

In [69]:
filename=dataset_text+"/"+"Flickr_8k.trainImages.txt"
train_imgs=load_photos(filename)
train_descriptions=load_clean_descriptions("descriptions.txt",train_imgs)
train_features=load_features(train_imgs)

In [70]:
len(train_descriptions)

6000

In [71]:
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc
#creating tokenizer class this will vectorize text corpus
#each Integer will represent token in dictionary
from tensorflow.keras.preprocessing.text import Tokenizer
def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer


In [72]:
#each word is a Index so store it into tokenizer.p
tokenizer = create_tokenizer(train_descriptions)
dump(tokenizer, open('tokenizer.p', 'wb'))
vocab_size = len(tokenizer.word_index) + 1
vocab_size 

7577

In [73]:
def max_length(descriptions):
    desc_list=dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)
max_length=max_length(descriptions)
max_length

32

In [74]:
features['1000268201_693b08cb0e.jpg'][0]

array([0.35922778, 0.03271931, 0.03686453, ..., 0.13820367, 0.02857169,
       0.3070938 ], dtype=float32)

In [124]:
def data_generator(descriptions,features,tokenizer,max_length):
    while 1:
        for key,description_list in descriptions.items():
            #retreiving features
            feature=features[key][0]
            input_image,input_sequence,output_word=create_sequences(tokenizer,max_length,description_list,feature)
            yield[[input_image,input_sequence],output_word]
            
def create_sequences(tokenizer,max_length,desc_list,feature):
    X1,X2,y=list(),list(),list()
    #walk through each description for the image
    for desc in desc_list:
        #encode the sequence
        seq=tokenizer.texts_to_sequences([desc])[0]
        #split one sequence into multiple X,y pairs
        for i in range(1,len(seq)):
            in_seq,out_seq=seq[:i],seq[i]
            #pad input sequence
            in_seq=pad_sequences([in_seq],maxlen=max_length)[0]
            #encode output sequence
            out_seq=to_categorical([out_seq],num_classes=vocab_size)[0]
            #store
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1),np.array(X2),np.array(y)

            

In [125]:
[a,b],c=next(data_generator(train_descriptions,features,tokenizer,max_length))
a.shape,b.shape,c.shape

((47, 2048), (47, 32), (47, 7577))

In [126]:
from tensorflow.keras.utils import plot_model
#defining the captioning model
def define_model(vocab_size,max_length):
    #features from the CNN model squeezzzed from 2048 to 256 nodes
    inputs1=Input(shape=(2048,))
    fe1=Dropout(0.5)(inputs1)
    fe2=Dense(256,activation='relu')(fe1)
    #LSTM sequence model
    inputs2=Input(shape=(max_length,))
    se1=Embedding(vocab_size,256,mask_zero=True)(inputs2)
    se2=Dropout(0.5)(se1)
    se3=LSTM(256)(se2)
    
    #Merging both models
    decoder1=add([fe2,se3])
    decoder2=Dense(256,activation='relu')(decoder1)
    outputs=Dense(vocab_size,activation='softmax')(decoder2)
    
    #tie it together [image,seq][word]
    model=Model(inputs=[inputs1,inputs2],outputs=outputs)
    model.compile(loss='categorical_crossentropy',optimizer='adam')
    
    #summarize model
    print(model.summary())
    #plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [None]:
#train our model
print('Dataset: ', len(train_imgs))
print('Descriptions: train=', len(train_descriptions))
print('Photos: train=', len(train_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)

model = define_model(vocab_size, max_length)
epochs = 10
steps = len(train_descriptions)
# making a directory models to save our models
os.mkdir("models")
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit_generator(generator, epochs=1, steps_per_epoch= steps, verbose=1)
    model.save("models/model_" + str(i) + ".h5")
