In [27]:
import string
import numpy as np
from PIL import Image
import os
from pickle import dump,load

from keras.applications.xception import Xception,preprocess_input
from keras.preprocessing.image import load_img,img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import add
from keras.models import Model,load_model
from keras.layers import Input,Dense,LSTM,Embedding,Dropout

from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm().pandas()


0it [00:00, ?it/s]

In [28]:
#read file

def load_doc(filename):
    file=open(filename,'r')
    text=file.read()
    file.close()
    return text

#assign captions to each image

def all_img_captions(filename):
    file=load_doc(filename)
    captions=file.split("\n")
    descriptions={}
    for caption in captions[:-1]:
        img,caption=caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]]=[caption]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions


# Cleaning data
def cleaning_text(captions):
    table = str.maketrans('', '', string.punctuation)
    for img, caps in captions.items():
        for i, img_caption in enumerate(caps):
            img_caption.replace("-"," ")
            desc = img_caption.split()

            # Convert to lower case
            desc = [word.lower() for word in desc]
            # Remove punctuation
            desc = [word.translate(table) for word in desc]
            #Remove hanging 's and a
            desc = [word for word in desc if(len(word)>1)]
            # Remove tokens with numbers
            desc = [word for word in desc if word.isalpha()]
            # Convert back to string
            img_caption = ' '.join(desc)
            # Update the descriptions dictionary with the cleaned caption
            captions[img][i] = img_caption
    return captions


#build vocab of unique words
def text_vocab(descriptions):
    vocab=set()
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    return vocab

#save all descriptions in one file
def save_descriptions(descriptions,filename):
    lines=list()
    for key,desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key+'\t'+desc)
    data="\n".join(lines)
    file=open(filename,"w")
    file.write(data)
    file.close()



In [33]:
datasetText='C:/Users/chawl/OneDrive/Desktop/MLBigdata/imgCaptioning/Flickr8k_text'
datasetImg='C:/Users/chawl/OneDrive/Desktop/MLBigdata/imgCaptioning/Flickr8k_Dataset/Flicker8k_Dataset'

In [34]:
filename=datasetText+"/"+'Flickr8k.token.txt'
descriptions=all_img_captions(filename)
print(len(descriptions))

8092


In [35]:
clean_descriptions=cleaning_text(descriptions)
vocab=text_vocab(clean_descriptions)
print(len(vocab))
save_descriptions(clean_descriptions,"descriptions.txt")

8763


In [36]:
def extractFeature(directory):
    model=Xception(include_top=False,pooling='avg')
    features={}
    for img in tqdm(os.listdir(directory)):
        filename=directory+"/"+img
        image=Image.open(filename)
        image=image.resize((299,299))
        image=np.expand_dims(image,axis=0)
        image=image/127.5
        image=image-1.0
        feature=model.predict(image)
        features[img]=feature
    return features

features=extractFeature(datasetImg)
dump(features,open("features.p","wb"))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for img in tqdm(os.listdir(directory)):


  0%|          | 0/8091 [00:00<?, ?it/s]



In [37]:
features=load(open("features.p","rb"))

In [38]:
def loadPhotos(filename):
    file = load_doc(filename)
    photos = file.split("\n")[:-1]
    return photos

def loadCleanDescriptions(filename, photos):
    file = load_doc(filename)
    # print(photos)
    descriptions = {}
    for line in file.split("\n"):
        words = line.split()
        # print(len(words))
        if len(words) < 1:
            continue
        image, imageCaption = words[0], words[1:]
        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = '<start> ' + " ".join(imageCaption) + ' <end>'
            descriptions[image].append(desc)
        
    # print("Description keys:", list(descriptions.keys()))  # Print description keys
    return descriptions



# Add print statements for debugging
# def check_descriptions(descriptions):
#     for img, desc_list in descriptions.items():
#         print(f"Image: {img}, Number of Descriptions: {len(desc_list)}")
#         for i, desc in enumerate(desc_list):
#             print(f"Description {i + 1}: {desc}")


def loadFeatures(photos):
    allFeatures = load(open("features.p", "rb"))
    features={k:allFeatures[k] for k in photos}
    return features


# Define your filename
filename = datasetText+"/"+"Flickr_8k.trainImages.txt"



In [None]:
# # Check for filenames in trainImg that are not in descriptions.txt
# missing_descriptions = [img for img in trainImg if img not in trainDesc]
# print("Images without descriptions:")
# print(len(missing_descriptions))


Images without descriptions:
40459


In [None]:
# print(trainImg)



In [39]:
# Train
trainImg = loadPhotos(filename)
print("Number of Images:", len(trainImg))

# Load and process descriptions
trainDesc = loadCleanDescriptions("descriptions.txt", trainImg)
print("Number of Descriptions:", len(trainDesc))

# List images without descriptions
# for img in trainImg:
#     if img not in trainDesc:
#         print("Image without description:", img)

# Load image features
trainFeatures = loadFeatures(trainImg)
print("Number of Image Features:", len(trainFeatures))


Number of Images: 6000
Number of Descriptions: 6000
Number of Image Features: 6000


In [None]:
# descriptions = loadCleanDescriptions(filename, trainImg)
# print("Description keys:", list(descriptions.keys()))

1000268201_693b08cb0e.jpg	a child in a pink dress is climbing up a set of stairs in an entry way
1000268201_693b08cb0e.jpg	a girl going into a wooden building
1000268201_693b08cb0e.jpg	a little girl climbing into a wooden playhouse
1000268201_693b08cb0e.jpg	a little girl climbing the stairs to her playhouse
1000268201_693b08cb0e.jpg	a little girl in a pink dress going into a wooden cabin
1001773457_577c3a7d70.jpg	a black dog and a spotted dog are fighting
1001773457_577c3a7d70.jpg	a black dog and a tri colored dog playing with each other on the road
1001773457_577c3a7d70.jpg	a black dog and a white dog with brown spots are staring at each other in the street
1001773457_577c3a7d70.jpg	two dogs of different breeds looking at each other on the road
1001773457_577c3a7d70.jpg	two dogs on pavement moving toward each other
1002674143_1b742ab4b8.jpg	a little girl covered in paint sits in front of a painted rainbow with her hands in a bowl
1002674143_1b742ab4b8.jpg	a little girl is sitting in f

In [None]:
# descriptions.keys

<function dict.keys>

In [44]:
#tokenization
#create clean list of descriptions
def dictToList(descriptions):
    allDesc=[]
    for key in descriptions.keys():
        [allDesc.append(d) for d in descriptions[key]]
    return allDesc

#create token class which will tokenize each text
def createTokenizer(descriptions):
    descList=dictToList(descriptions)
    tokenizer=Tokenizer()
    tokenizer.fit_on_texts(descList)
    return(tokenizer)

In [45]:
#give each word an index and store that in tokenizer pickel file
tokenizer=createTokenizer(trainDesc)
dump(tokenizer,open('tokenizer.p','wb'))
vocabSize=len(tokenizer.word_index)+1
vocabSize

7577

In [46]:
#calculate max length of description
def maxLenDesc(descriptions):
    descList=dictToList(descriptions)
    return max(len(d.split()) for d in descList)

maxLength=maxLenDesc(descriptions)
maxLength

32

In [52]:
def dataGenerator(descriptions, features, tokenizer, maxLength):
    while 1:
        for key, descList in descriptions.items():
            feature = features[key][0]
            inputImage, inputSequence, outputWord = create_sequences(tokenizer, maxLength, descList, feature)
            yield [[inputImage, inputSequence], outputWord]

def create_sequences(tokenizer, maxLength, descList, feature):
    X1, X2, y = list(), list(), list()
    for desc in descList:  # Remove the parentheses here
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            inSeq, outSeq = seq[:i], seq[i]
            inSeq = pad_sequences([inSeq], maxlen=maxLength)[0]
            outSeq = to_categorical([outSeq], num_classes=vocabSize)[0]
            X1.append(feature)
            X2.append(inSeq)
            y.append(outSeq)
    return np.array(X1), np.array(X2), np.array(y)


In [53]:
[a,b],c = next(dataGenerator(trainDesc, features, tokenizer, maxLength))
a.shape, b.shape, c.shape

((47, 2048), (47, 32), (47, 7577))

In [54]:
# Defining RNN CNN Model
from keras.utils import plot_model
def defineModel(vocabSize,maxLength):
    input1=Input(shape=(2048,))
    fe1=Dropout(0.5)(input1)
    fe2=Dense(256,activation='relu')(fe1)

    input2=Input(shape=(maxLength,))
    se1=Embedding(vocabSize,256,mask_zero=True)(input2)
    se2=Dropout(0.5)(se1)
    se3=LSTM(256)(se2)

    decoder1=add([fe2,se3])
    decoder2=Dense(256,activation='relu')(decoder1)
    output=Dense(vocabSize,activation='softmax')(decoder2)

    model=Model(inputs=[input1,input2],outputs=output)
    model.compile(loss='categorical_crossentropy',optimizer='adam')

    print(model.summary())
    plot_model(model,to_file='model.png',show_shapes=True)

    return model

In [57]:
print('Dataset:',len(trainImg))
print('Descriptions: train=',len(trainDesc))
print('Photos: train=',len(trainFeatures))
print('Vocab size=',vocabSize)
print('Description Length=',maxLength)

model=defineModel(vocabSize,maxLength)
epochs=10
steps=len(trainDesc)
os.mkdir("models")
for i in range(epochs):
    generator=dataGenerator(trainDesc,trainFeatures,tokenizer,maxLength)
    model.fit_generator(generator,epochs=1,steps_per_epoch=steps,verbose=1)
    model.save("models/model"+str(i)+".h5")

Dataset: 6000
Descriptions: train= 6000
Photos: train= 6000
Vocab size= 7577
Description Length= 32
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_10 (InputLayer)       [(None, 32)]                 0         []                            
                                                                                                  
 input_9 (InputLayer)        [(None, 2048)]               0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 32, 256)              1939712   ['input_10[0][0]']            
                                                                                                  
 dropout_4 (Dropout)         (None, 2048)                 0         ['input_9[0][0]']      

  model.fit_generator(generator,epochs=1,steps_per_epoch=steps,verbose=1)




  saving_api.save_model(




0