## Import Modules

In [None]:
import os
import re
import joblib
from tqdm.notebook import tqdm
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.optimizers import Adam

# Root folder holding captions.txt, the Images folder and the saved artefacts.
# A raw string keeps the Windows backslashes literal instead of relying on
# unrecognised escape sequences (\C, \E, \I), which newer Python versions warn about.
directory = r'D:\CS_Internship\Exercises\Image Captioning'

## Read Captions

In [None]:
# Read captions.txt and keep everything after its first line
# (presumably a column header -- confirm against the data file).
with open(os.path.join(directory, 'captions.txt'), 'r') as f:
    captions_lines = f.readlines()[1:]

# Re-assemble the remaining lines into a single string.
captions_file = ''.join(captions_lines)

### Dictionary of Images and their captions

In [None]:
# Maps an image id (file name up to its first '.') to the list of its captions.
image_captions = {}

for line in captions_file.split('\n'):
    tokens = line.split(',')

    # Only lines with at least one comma carry both an image name and a caption.
    if len(tokens) >= 2:
        image_id = tokens[0].split('.')[0]
        # Commas inside the caption were split apart above; rejoin the pieces with spaces.
        caption = " ".join(tokens[1:]).strip()

        if image_id not in image_captions:
            image_captions[image_id] = []
        image_captions[image_id].append(caption)

image_numbers = len(image_captions)
print("Image Numbers:", image_numbers)

## Preprocess Captions

In [None]:
def clean_text(text):
    """Normalise a caption and wrap it in the 'begin' / 'end' markers.

    The text is lower-cased, only runs of ASCII letters are kept, and
    single-letter words are dropped. The surviving words are joined with
    single spaces between the literal markers "begin" and "end".
    """
    lowered = text.lower()
    kept = (word for word in re.findall(r'[a-zA-Z]+', lowered) if len(word) > 1)
    return "begin " + " ".join(kept) + " end"

# Replace every image's caption list, in place, with its cleaned counterpart.
for key, captions in image_captions.items():
    image_captions[key] = list(map(clean_text, captions))

### Caption information

In [None]:
# Flatten the per-image caption lists into one list of cleaned captions.
all_captions = [caption for captions in image_captions.values() for caption in captions]
count_captions = len(all_captions)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

# Keras Tokenizer assigns word indices starting at 1; index 0 is left for padding.
# The largest index therefore equals len(word_index), so one extra slot is needed
# for the one-hot targets and the Embedding input dimension to cover it.
vocab_size = len(tokenizer.word_index) + 1

# Longest caption measured in whitespace-separated words (markers included).
max_length = max(len(caption.split()) for caption in all_captions)

count_captions, vocab_size, max_length

## Using a pretrained model

In [None]:
# Load VGG16 and re-wire it so the model ends at its second-to-last layer,
# i.e. the final layer is dropped and that layer's activations become the output.
base_model = VGG16()
base_model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" to the output.
base_model.summary()

### Extract image features

In [None]:
# Maps an image id (file name up to its first '.') to the feature array
# returned by base_model.predict for that image.
image_directory = os.path.join(directory, 'Images')
image_features = {}

for img_name in tqdm(os.listdir(image_directory)):
    img_path = os.path.join(image_directory, img_name)

    # Load at 224x224, convert to an array and add a leading batch axis of size 1.
    image = img_to_array(load_img(img_path, target_size=(224, 224)))
    image = np.expand_dims(image, axis=0)
    image = preprocess_input(image)

    feature = base_model.predict(image)

    image_id = img_name.split('.')[0]
    image_features[image_id] = feature

### Save and Load extracted features

In [None]:
# Persist the extracted features, then read them straight back from disk.
features_path = os.path.join(directory, 'Extracted_Features.joblib')
joblib.dump(image_features, features_path)
image_features = joblib.load(features_path)

## Split data into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Split at the image level: 10% of the image ids are held out as the test set,
# with a fixed random_state so the split is reproducible.
image_ids = list(image_captions)
train, test = train_test_split(
    image_ids,
    test_size=0.10,
    random_state=42,
)

len(train), len(test)

## Functions to generate, build and train model

In [None]:
def generator_data(data_keys, image_captions, image_features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield training batches of (image feature, caption prefix) -> next word.

    For every batch, `batch_size` keys are drawn at random (with replacement)
    from `data_keys`, and one caption is drawn at random for each key. The
    caption is converted to a token-id sequence; every proper prefix of that
    sequence becomes one sample whose target is the token that follows it.

    Yields:
        ([features, prefixes], targets) where
        - features holds `image_features[key][0]` once per sample,
        - prefixes holds the prefix padded to `max_length` by `pad_sequences`,
        - targets holds the one-hot encoded next token (`vocab_size` classes).
    """
    while True:
        batch_keys = np.random.choice(data_keys, size=batch_size, replace=True)
        features, prefixes, targets = [], [], []

        for key in batch_keys:
            caption = np.random.choice(image_captions[key])
            token_ids = tokenizer.texts_to_sequences([caption])[0]

            # Position `split` separates the input prefix from the token to predict.
            for split, next_token in enumerate(token_ids[1:], start=1):
                prefix = pad_sequences([token_ids[:split]], maxlen=max_length)[0]
                target = to_categorical([next_token], num_classes=vocab_size)[0]

                features.append(image_features[key][0])
                prefixes.append(prefix)
                targets.append(target)

        yield [np.array(features), np.array(prefixes)], np.array(targets)

In [None]:
def build_model(max_length, vocab_size):
    """Build and compile the two-input captioning model.

    The image branch takes a 4096-long feature vector, applies dropout and a
    256-unit ReLU dense layer. The text branch takes a sequence of
    `max_length` token ids, embeds it (256 dims, zero treated as a mask
    value), applies dropout and a 256-unit LSTM. Both branches are summed,
    passed through a 256-unit ReLU dense layer and a softmax over
    `vocab_size` classes.

    Returns:
        The model compiled with categorical cross-entropy and a default Adam optimizer.
    """
    image_input = Input(shape=(4096,))
    text_input = Input(shape=(max_length,))

    # Image branch.
    image_dropped = Dropout(0.4)(image_input)
    i1 = Dense(256, activation='relu')(image_dropped)

    # Text branch.
    embedded = Embedding(vocab_size, 256, mask_zero=True)(text_input)
    text_dropped = Dropout(0.4)(embedded)
    t1 = LSTM(256)(text_dropped)

    # Merge both branches and predict the next word.
    merged = add([i1, t1])
    decoder = Dense(256, activation='relu')(merged)
    outputs = Dense(vocab_size, activation='softmax')(decoder)

    model = Model(inputs=[image_input, text_input], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer=Adam())
    return model

In [None]:
def train_model(model, train_data, image_captions, image_features, tokenizer, max_length, vocab_size, batch_size, epochs):
    """Fit `model` for `epochs` rounds, each on a freshly created batch generator.

    Every round calls `model.fit` for a single epoch of
    `len(train_data) // batch_size` steps, fed by a new `generator_data`
    instance built from the given arguments. Returns nothing.
    """
    steps_per_epoch = len(train_data) // batch_size
    generator_args = (train_data, image_captions, image_features, tokenizer, max_length, vocab_size, batch_size)

    for _epoch in range(epochs):
        batches = generator_data(*generator_args)
        model.fit(batches, epochs=1, steps_per_epoch=steps_per_epoch, verbose=1)

### Apply functions and save model

In [None]:
# Training hyper-parameters.
epochs = 20
batch_size = 32

# Build the captioning model and train it on the training image ids.
model = build_model(max_length, vocab_size)
train_model(
    model,
    train,
    image_captions,
    image_features,
    tokenizer,
    max_length,
    vocab_size,
    batch_size,
    epochs,
)

In [None]:
# Store the trained model next to the dataset.
model_path = directory + '/image_captioning_model20_32.keras'
model.save(model_path)

## Predict and Generate Caption

In [None]:
def idx_to_word(integer, tokenizer):
    """Return the first word in `tokenizer.word_index` mapped to `integer`.

    Returns None when no word carries that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)

def predict_caption(model, image, tokenizer, max_length):
    """Greedily generate a caption for one image.

    Starting from the 'begin' marker, the text generated so far is tokenized,
    padded to `max_length` and passed to `model.predict` together with
    `image`; the highest-scoring index is mapped back to a word and appended.
    Generation stops when the 'end' marker is produced, when the predicted
    index has no word, or after `max_length` steps.

    Args:
        model: Model whose `predict` accepts `[image, padded_sequence]`.
        image: Input for the model's image branch (the callers in this file
            pass an entry of `image_features`).
        tokenizer: Fitted tokenizer used for encoding and decoding.
        max_length: Padding length and upper bound on generated words.

    Returns:
        The generated caption as a string, beginning with 'begin'.
    """
    # The start marker must match the one used when cleaning the training
    # captions ("begin"); the former 'begin:' only worked because the
    # tokenizer's default filters happened to drop the colon.
    in_text = 'begin'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        y = model.predict([image, sequence])
        # Greedy choice: index of the largest score in the prediction.
        y = np.argmax(y)
        word = idx_to_word(y, tokenizer)
        if word is None:
            break
        in_text += " " + word
        if word == 'end':
            break
    return in_text

def generate_caption(image_name):
    """Show an image, print its reference captions and the predicted caption.

    Reads the module-level `directory`, `image_captions`, `image_features`,
    `model`, `tokenizer` and `max_length`.

    Args:
        image_name: File name of the image inside the 'Images' folder.

    Raises:
        KeyError: If the image id derived from `image_name` is missing from
            `image_captions` or `image_features`.
    """
    # Derive the lookup key from the requested file, the same way the caption and
    # feature dictionaries were keyed. Previously the function read whatever
    # global `image_id` an earlier loop had left behind.
    image_id = image_name.split('.')[0]

    # Same folder name as used for feature extraction; os.path.join avoids the
    # hand-written "\images" separator.
    img_path = os.path.join(directory, 'Images', image_name)
    image = Image.open(img_path)
    plt.imshow(image)

    captions = image_captions[image_id]
    print('Real Captions:')
    for caption in captions:
        print(caption)

    y_pred = predict_caption(model, image_features[image_id], tokenizer, max_length)
    print('Predicted Caption:','\n', y_pred)