# Importing Libraries-

In [None]:
from numpy import argmax, array
from pickle import load, dump
from keras.layers.merge import add
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.image import load_img, img_to_array
from keras.models import load_model, Model
from keras.utils import to_categorical
from keras.applications.vgg16 import VGG16, preprocess_input

# Make some important functions-

## For loading the image input data

In [None]:
def IMAGE_TEXT_LOAD(FNAME):
    """Read a file of image file names and return the image identifiers.

    Every non-empty line is cut at its first '.', and the text before the
    dot is kept as the identifier.

    Args:
        FNAME: path of a text file holding one image file name per line.

    Returns:
        A set of identifiers (duplicate lines collapse into one entry).
    """
    # `with` closes the handle even when reading raises
    with open(FNAME, 'r') as file:
        doc = file.read()

    # Empty lines are skipped; only the part before the first dot is kept
    return {line.split('.')[0] for line in doc.split('\n') if line}


## Adding sequence tokens

In [None]:
def MATCH_DESC_ADD_TOKEN(FNAME, DATA):
    """Collect the descriptions of selected images, wrapped in SOS/EOS tokens.

    Each line of the file is split on whitespace: the first token is the
    image id, the remaining tokens form the description.

    Args:
        FNAME: path of the descriptions file.
        DATA: collection of image ids to keep (membership is tested with
            `in`).

    Returns:
        A dict mapping each kept image id to a list of strings of the form
        'SOS <description> EOS' (SOS = start of sequence, EOS = end of
        sequence).
    """
    # `with` closes the handle even when reading raises
    with open(FNAME, 'r') as file:
        doc = file.read()

    REFINED_DESC = dict()

    for line in doc.split('\n'):
        tokens = line.split()  # split the line on whitespace
        if not tokens:
            # Blank line (e.g. the trailing newline of the file): there is
            # no id to read, so skip it instead of failing on tokens[0]
            continue

        image_id, image_desc = tokens[0], tokens[1:]

        # Ignore images that are not part of the requested set
        if image_id not in DATA:
            continue

        desc = 'SOS ' + ' '.join(image_desc) + ' EOS'
        REFINED_DESC.setdefault(image_id, []).append(desc)

    return REFINED_DESC


## Making dictionary to Tokenise (and its reverse) and Max Sequence length

In [None]:
def WORD_TO_ID(descriptions):
    """Fit a Keras Tokenizer on every caption in *descriptions*.

    Args:
        descriptions: dict mapping an image id to a list of caption strings.

    Returns:
        The fitted Tokenizer instance.
    """
    # Flatten all captions of all images into one list
    lines = [d for desc_list in descriptions.values() for d in desc_list]

    # fit_on_texts is an instance method that updates the tokenizer in
    # place, so the instance itself (not the call's result) is returned
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def ID_TO_WORD(integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` is *integer*.

    The mapping is scanned in iteration order and the first match wins;
    None is returned when no word carries that index.
    """
    matches = (w for w, idx in tokenizer.word_index.items() if idx == integer)
    return next(matches, None)

def MAX_SEQUENCE_LENGTH(descriptions):
    """Return the word count of the longest caption in *descriptions*.

    *descriptions* maps an image id to a list of caption strings; words are
    counted by splitting each caption on whitespace.
    """
    word_counts = [
        len(caption.split())
        for captions in descriptions.values()
        for caption in captions
    ]
    return max(word_counts)


## For making description 

In [None]:
def CREATE_DESCRIPTION(model, tokenizer, photo, max_length):
    """Generate a caption for *photo* one word at a time.

    Starting from 'SOS', the text produced so far is encoded, padded to
    *max_length* and fed to the model together with the photo features; the
    highest-scoring index is mapped back to a word and appended.

    Args:
        model: trained model taking ``[photo, sequence]`` as input.
        tokenizer: fitted tokenizer used for encoding and decoding.
        photo: image features passed to the model unchanged.
        max_length: padded sequence length, also the maximum number of
            words generated.

    Returns:
        The generated text, starting with 'SOS' and, when the model predicts
        it, ending with the end-of-sequence token.
    """
    in_text = 'SOS'  # seed of the generation process
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]  # integer encoding
        sequence = pad_sequences([sequence], maxlen=max_length)  # pad the input
        yhat = model.predict([photo, sequence], verbose=0)  # next-word scores
        yhat = argmax(yhat)  # index of the most likely word
        word = ID_TO_WORD(yhat, tokenizer)  # map the index back to a word
        if word is None:
            break
        in_text += ' ' + word  # extend the input for the next step
        # A Keras Tokenizer with default settings lower-cases its vocabulary,
        # so the end token comes back as 'eos'; compare case-insensitively
        # to stop in either case.
        if word.lower() == 'eos':
            break

    return in_text


## For making sequences of I/P's and O/P's

In [None]:
def MAKE_SEQUENCES(tokenizer, max_length, descriptions, photos, vocab_size=None):
    """Build training triples (image features, input sequence, next word).

    Every caption is encoded to integers and expanded into one sample per
    prefix: the prefix (padded to *max_length*) is the input and the word
    that follows it, one-hot encoded, is the target.

    Args:
        tokenizer: fitted tokenizer used to encode the captions.
        max_length: length the input sequences are padded to.
        descriptions: dict mapping an image id to a list of captions.
        photos: dict mapping an image id to its features; element ``[0]``
            of each entry is used.
        vocab_size: number of classes of the one-hot target. Defaults to
            ``len(tokenizer.word_index) + 1`` so the function no longer
            relies on a module-level ``vocab_size`` being defined.

    Returns:
        Three numpy arrays: image features, padded input sequences and
        one-hot encoded output words.
    """
    if vocab_size is None:
        # +1 because index 0 is not assigned to any word in word_index
        vocab_size = len(tokenizer.word_index) + 1

    X1, X2, y = list(), list(), list()

    for key, desc_list in descriptions.items():  # every image
        for desc in desc_list:  # every caption of that image
            seq = tokenizer.texts_to_sequences([desc])[0]  # encode the caption

            # Split the caption into (prefix, next word) pairs
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]

                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]  # pad input
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]  # one-hot

                X1.append(photos[key][0])
                X2.append(in_seq)
                y.append(out_seq)

    return array(X1), array(X2), array(y)


## Let's define our CNN and RNN model

In [None]:
def MODEL_DEFINITION(vocab_size, max_length):
    """Build and compile the two-input captioning network.

    One branch takes a 4096-long image feature vector, the other an integer
    sequence of length *max_length*; both are projected to 256 units, summed
    and decoded into a softmax over *vocab_size* words.

    Returns:
        The compiled Keras model (categorical cross-entropy loss, Adam).
    """
    # Image branch: dropout followed by a dense projection
    image_in = Input(shape=(4096,))
    image_drop = Dropout(0.5)(image_in)
    image_vec = Dense(256, activation='relu')(image_drop)

    # Text branch: embedding (index 0 masked), dropout, LSTM
    text_in = Input(shape=(max_length,))
    text_embed = Embedding(vocab_size, 256, mask_zero=True)(text_in)
    text_drop = Dropout(0.5)(text_embed)
    text_vec = LSTM(256)(text_drop)

    # Decoder: sum both branches, then dense layers up to the word softmax
    merged = add([image_vec, text_vec])
    hidden = Dense(256, activation='relu')(merged)
    word_probs = Dense(vocab_size, activation='softmax')(hidden)

    # Tie the two inputs to the single output and compile
    captioner = Model(inputs=[image_in, text_in], outputs=word_probs)
    captioner.compile(loss='categorical_crossentropy', optimizer='adam')

    return captioner


## For generating sets of I/O used while training

In [None]:
def GENERATE_DATA(descriptions, photos, tokenizer, max_length):
    """Yield one training batch per image, forever.

    Args:
        descriptions: dict mapping an image id to a list of captions.
        photos: dict mapping an image id to its features.
        tokenizer: fitted tokenizer used to encode the captions.
        max_length: length the input sequences are padded to.

    Yields:
        ``[[image_features, input_sequences], output_words]`` built from
        all captions of a single image.
    """
    while 1:  # loop endlessly; the caller decides how many steps to draw
        for key, desc_list in descriptions.items():
            # MAKE_SEQUENCES iterates `descriptions.items()` and reads
            # `photos[key][0]`, so it needs dicts keyed by image id: wrap
            # this image's captions in a one-entry dict and hand over the
            # feature dict itself rather than a bare list / feature array.
            in_img, in_seq, out_word = MAKE_SEQUENCES(
                tokenizer, max_length, {key: desc_list}, photos
            )

            yield [[in_img, in_seq], out_word]



## Preparing Training data

In [None]:
# Load the identifiers of the training images (6K)
FNAME1 = 'Flickr8k_text/Flickr_8k.trainImages.txt'
train = IMAGE_TEXT_LOAD(FNAME1)

# Captions of the training images, wrapped in SOS/EOS tokens
train_descriptions = MATCH_DESC_ADD_TOKEN('descriptions.txt', train)

# Fit the tokenizer on the training captions
tokenizer = WORD_TO_ID(train_descriptions)

# Save the tokenizer; `with` makes sure the file is flushed and closed
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

# Load all precomputed image features (binary read mode) and keep only the
# training ones
with open('features.pkl', 'rb') as features_file:
    Load_all_features = load(features_file)
train_features = {k: Load_all_features[k] for k in train}

# +1 because index 0 is not assigned to any word in word_index
vocab_size = len(tokenizer.word_index) + 1

# Longest caption (in words) of the training set
max_length = MAX_SEQUENCE_LENGTH(train_descriptions)

# Build the full set of training sequences
X1train, X2train, ytrain = MAKE_SEQUENCES(tokenizer, max_length, train_descriptions, train_features)

## Preparing Testing data

In [None]:
# Load the identifiers of the test images
FNAME2 = 'Flickr8k_text/Flickr_8k.testImages.txt'
test = IMAGE_TEXT_LOAD(FNAME2)

# Captions of the test images, wrapped in SOS/EOS tokens
test_descriptions = MATCH_DESC_ADD_TOKEN('descriptions.txt', test)

# Load all precomputed image features (binary read mode) and keep only the
# test ones; `with` closes the file once the pickle has been read
with open('features.pkl', 'rb') as features_file:
    Load_all_features = load(features_file)
test_features = {k: Load_all_features[k] for k in test}

# Build the test sequences with the tokenizer and length fixed on the
# training set
X1test, X2test, ytest = MAKE_SEQUENCES(tokenizer, max_length, test_descriptions, test_features)



## Fitting the Model

In [None]:
model = MODEL_DEFINITION(vocab_size, max_length)  # Defining the Model

EPOCHS = 10
STEPS = len(train_descriptions)  # one generator batch per training image

for i in range(EPOCHS):
    # A fresh generator for every epoch
    generator = GENERATE_DATA(train_descriptions, train_features, tokenizer, max_length)

    # Train for exactly one epoch
    model.fit_generator(generator, epochs=1, steps_per_epoch=STEPS, verbose=1)

    # Save a checkpoint per epoch. The epoch number is zero-padded to two
    # digits ('model_00.h5' ... 'model_09.h5') so the file name matches the
    # 'model_09.h5' that is loaded afterwards.
    model.save('model_{:02d}.h5'.format(i))

# Load the checkpoint written by the last epoch
filename = 'model_{:02d}.h5'.format(EPOCHS - 1)
model = load_model(filename)


# Getting caption of a sample image

## First get the Image features

In [None]:
def TAKE_IMAGE_FEATURES(FNAME):
    """Extract VGG16 features for a single image file.

    Args:
        FNAME: path of the image to load.

    Returns:
        The prediction of the truncated VGG16 model for the image, as
        returned by ``model.predict`` on a batch of one.
    """
    # Load the full VGG16 model
    model = VGG16()

    # Use the output of the second-to-last layer as the feature vector.
    # Taking layers[-2] directly replaces `model.layers.pop()` followed by
    # layers[-1]: on Keras versions where `layers` returns a copy of the
    # list, the pop has no effect and the classifier output would be
    # returned instead of the features.
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

    image = load_img(FNAME, target_size=(224, 224))  # load and resize the photo

    image = img_to_array(image)  # convert the image to a numpy array

    # Add a leading batch dimension of size 1
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))

    image = preprocess_input(image)  # VGG16-specific input preprocessing

    # Run the truncated network to obtain the features
    feature = model.predict(image, verbose=0)

    return feature


## Let's get the captions

In [None]:
# Load the tokenizer saved during training; `with` closes the file once the
# pickle has been read. NOTE: only unpickle files you created yourself.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

# Maximum sequence length, pre-set to the value obtained during training
max_length = 34

# Load the trained model
model = load_model('model_09.h5')

# Load the photograph and extract its features
photo = TAKE_IMAGE_FEATURES('Some_image.jpg')

# Generate and show the description
description = CREATE_DESCRIPTION(model, tokenizer, photo, max_length)
print(description)
