<a href="https://colab.research.google.com/github/amirhoseinaghaei/Image-Captioning-/blob/main/Image_Captioning_For_Flicker_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing libraries

In [None]:
import os
import numpy as np 
import pickle
import tensorflow as tf
from tensorflow.keras import models , Sequential
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.models import Model
from tqdm import tqdm 
# from tensorflow.keras.preprocessing.text import Tokenizer 
# from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add, Conv2D , MaxPooling2D   , Flatten

## Getting the dataset from Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive/.
from google.colab import drive 
drive.mount('/content/gdrive/')
# Both paths below are relative, so they resolve against the current working
# directory (presumably /content, Colab's default -- confirm if the cwd changes).
# BASE_DIR: dataset root; later cells read "Images" and "captions.txt" from it.
BASE_DIR = "gdrive/MyDrive/Flicker_Dataset"
# WORKING_DIR: output folder; the extracted features pickle is written here.
# NOTE(review): this path ends with a trailing space -- confirm it matches the
# actual Drive folder name, otherwise writes into it will fail.
WORKING_DIR  = "gdrive/MyDrive/Image_Captioning_Project "

## Load pretrained VGG16 model 

In [None]:
# Build the image feature extractor: a full VGG16 network, re-wrapped so that
# its output is taken from the second-to-last layer instead of the final one.
base_model = VGG16()
model = Model(
    inputs=base_model.inputs,
    outputs=base_model.layers[-2].output,
)
model.summary()

## Load the Flickr dataset images and extract their features with the VGG16 model

In [None]:
# Load every image of the dataset and extract its feature vector with `model`.
from tqdm import tqdm
# VGG16's pretrained weights expect inputs normalised by its own
# preprocess_input; feeding raw 0-255 RGB arrays degrades the features.
from tensorflow.keras.applications.vgg16 import preprocess_input

features = {}  # image id (file name up to the first ".") -> feature array
img_list = []
directory = os.path.join(BASE_DIR, "Images")
for img_name in tqdm(os.listdir(directory)):
    image_path = os.path.join(directory, img_name)
    # Resize to the 224x224 input size and convert to a float array.
    image = load_img(image_path, target_size=(224, 224))
    image = img_to_array(image)
    # Add a leading batch dimension: (H, W, C) -> (1, H, W, C).
    image = image.reshape(1, image.shape[0], image.shape[1], image.shape[2])
    image = preprocess_input(image)
    # verbose=0 keeps Keras' per-call progress bar from flooding the tqdm output.
    feature = model.predict(image, verbose=0)
    image_id = img_name.split(".")[0]
    features[image_id] = feature


## Save the extracted features 

In [25]:
# Save the features dictionary to a pickle file inside WORKING_DIR.
import pickle

features_path = os.path.join(WORKING_DIR, "features.pkl")
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way (the bare open() call leaked the handle).
with open(features_path, "wb") as features_file:
    pickle.dump(features, features_file)

## Loading Captions for each image

In [66]:
# Read the whole captions file into one string, skipping its first line
# (presumably a column header -- confirm against the data file).
captions_path = os.path.join(BASE_DIR, "captions.txt")
with open(captions_path, 'r') as captions_file:
    next(captions_file)
    captions = captions_file.read()

In [86]:
# Build `mapping`: image id -> list of all captions given for that image.
mapping = {}
line = captions.split("\n")
for record in line:
    fields = record.split(",")
    # Skip blank or malformed records that carry no caption field.
    # (The check must look at this record's fields, not at the list of lines.)
    if len(fields) < 2:
        continue
    # The image id is the file name up to its first ".".
    img_id = fields[0].split(".")[0]
    # A caption containing commas was split into several fields; rejoin them
    # with spaces, as the original code did.
    caption = " ".join(fields[1:])
    mapping.setdefault(img_id, []).append(caption)


## Defining the model

In [None]:
from tensorflow.python.keras.layers.merge import Add
from tensorflow.python.ops.gen_array_ops import InplaceAdd
from keras.backend import conv2d

from keras import optimizers
from tensorflow.python.ops.nn_ops import relu
def Build_CNN_FeatureExtractor(vocab_size, max_length=35):
    """Build the two-input image-captioning network.

    The image branch stacks five blocks of 3x3 convolutions (2, 2, 3, 3 and 3
    layers with 64, 128, 256, 512 and 512 filters), each followed by 2x2
    max-pooling, then dense layers that reduce the result to a 256-d vector.
    The text branch embeds a token-id sequence and encodes it with an LSTM
    into a 256-d vector. Both vectors are summed and decoded into a softmax
    distribution over the vocabulary.

    Args:
        vocab_size: Number of tokens in the vocabulary; sets the embedding
            input dimension and the size of the output layer.
        max_length: Length of the token sequence fed to the text branch.
            Defaults to 35, the value previously hard-coded.

    Returns:
        An uncompiled Keras ``Model`` taking ``[image, token_sequence]`` and
        producing a ``vocab_size``-way softmax.
    """
    # (filters, number of conv layers) for each convolutional block.
    conv_blocks = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    # ---- Image branch ----
    image_input = Input(shape=(224, 224, 3))
    x = image_input
    for filters, n_convs in conv_blocks:
        for _ in range(n_convs):
            x = Conv2D(filters, (3, 3), activation=tf.nn.relu, padding="same")(x)
        x = MaxPooling2D((2, 2), (2, 2))(x)
    x = Flatten()(x)
    x = Dense(4096, activation=tf.nn.relu)(x)
    x = Dropout(0.4)(x)
    x = Dense(4096)(x)
    image_features = Dense(256)(x)

    # ---- Text branch ----
    caption_input = Input(shape=(max_length,))
    embedded = Embedding(input_dim=vocab_size, output_dim=256)(caption_input)
    embedded = Dropout(0.4)(embedded)
    caption_features = LSTM(256)(embedded)

    # ---- Decoder: merge both 256-d vectors and predict a vocabulary token ----
    merged = add([image_features, caption_features])
    hidden = Dense(256, activation=tf.nn.relu)(merged)
    output = Dense(vocab_size, activation=tf.nn.softmax)(hidden)
    return Model(inputs=[image_input, caption_input], outputs=output)
 
# Instantiate the captioning model and compile it for training.
vocab_size = 8600
model = Build_CNN_FeatureExtractor(vocab_size)
optimizer = tf.optimizers.Adam()
# Use the lowercase "accuracy" shortcut: Keras then picks the accuracy variant
# that matches the loss (categorical accuracy here). The capitalised
# "Accuracy" resolves to the exact-match metric class, which compares one-hot
# targets with raw softmax outputs element-wise and reports ~0.
model.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

## Plotting the model

In [None]:
# Draw the architecture of `model` with Keras' plot_model utility, using its
# default arguments.
plot_model(model= model)