In [1]:
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, Input, Dropout, Add
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input
import numpy as np
import os
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
def load_descriptions(file_path, start_token='startseq', end_token='endseq'):
    """Read a tab-separated captions file into ``{image_id: [caption, ...]}``.

    Every usable line has the form ``<image name><TAB><caption>``. Lines with
    fewer than two tab-separated fields are skipped. The image id is the part
    of the first field before its first ``.`` (the same rule
    ``preprocess_images`` uses for its keys), and captions are lower-cased.

    Each caption is wrapped as ``"<start_token> caption <end_token>"`` so the
    tokenizer learns the markers that ``generate_caption`` seeds decoding with
    and stops on. Pass empty strings to get the bare captions.

    Args:
        file_path: Path to the captions text file.
        start_token: Marker prepended to every caption.
        end_token: Marker appended to every caption.

    Returns:
        dict mapping image id to the list of its processed captions, in file
        order.
    """
    descriptions = {}
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split('\t')
            if len(tokens) < 2:
                continue
            img_id, caption = tokens[0], tokens[1]
            img_id = img_id.split('.')[0]
            # Drop empty pieces so disabled markers leave no stray spaces.
            pieces = (start_token, caption.strip().lower(), end_token)
            wrapped = ' '.join(piece for piece in pieces if piece)
            descriptions.setdefault(img_id, []).append(wrapped)
    return descriptions

def preprocess_images(image_folder):
    """Extract a VGG16 feature vector for every image under ``image_folder``.

    Args:
        image_folder: Directory containing image files. A path to a single
            image file is also accepted.

    Returns:
        dict mapping image id (file name up to its first ``.``) to the output
        of VGG16's penultimate layer for that image. Empty if no image files
        are found.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif',
                        '.tif', '.tiff', '.webp')

    # Resolve the file list first so a bad path fails before the (slow)
    # model construction / weight download.
    if os.path.isfile(image_folder):
        img_paths = [image_folder]
    else:
        # Sorted for a reproducible order; stray non-image entries (hidden
        # files, sub-directories, text files) would make load_img raise.
        img_paths = [
            os.path.join(image_folder, img_name)
            for img_name in sorted(os.listdir(image_folder))
            if img_name.lower().endswith(image_extensions)
        ]

    features = {}
    if not img_paths:
        return features

    model = VGG16(weights='imagenet')
    model = Model(inputs=model.input, outputs=model.layers[-2].output)  # Use penultimate layer

    for img_path in img_paths:
        img = load_img(img_path, target_size=(224, 224))
        img = img_to_array(img)
        img = np.expand_dims(img, axis=0)  # add the batch axis
        img = preprocess_input(img)
        img_id = os.path.basename(img_path).split('.')[0]
        # verbose=0: otherwise every single-image call prints a progress bar.
        features[img_id] = model.predict(img, verbose=0)[0]
    return features

# Parse the caption file, then compute one CNN feature vector per image
descriptions = load_descriptions(file_path='Flickr8k_Dataset/captions.txt')
image_features = preprocess_images(image_folder='Flickr8k_Dataset/images/')


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


KeyboardInterrupt: 

In [None]:
# Flatten the per-image caption lists into one corpus (dictionary order)
all_captions = [
    caption
    for captions in descriptions.values()
    for caption in captions
]

# Fit the word index on the whole corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

# One slot more than the number of distinct words
# (presumably index 0 is left for padding -- the Embedding layer uses mask_zero=True)
vocab_size = 1 + len(tokenizer.word_index)

def create_sequences(tokenizer, max_length, descriptions, image_features, vocab_size=None):
    """Expand captions into (image feature, caption prefix) -> next-word samples.

    For a caption tokenized as ``[w0, w1, ..., wn]`` one sample is produced
    for every ``i`` in ``1..n``: the prefix ``[w0..w(i-1)]`` padded to
    ``max_length`` as input and ``wi`` one-hot encoded as target.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        max_length: Length every caption prefix is padded/truncated to.
        descriptions: dict mapping image id to a list of caption strings.
        image_features: dict mapping image id to its feature vector.
        vocab_size: Width of the one-hot targets. Defaults to
            ``len(tokenizer.word_index) + 1`` so the function no longer
            depends on a notebook-level global.

    Returns:
        Tuple ``(X1, X2, y)`` of arrays: image features, padded prefixes and
        one-hot targets, row-aligned. Three empty arrays if no sample could
        be built.

    Note:
        ``y`` is a dense ``(samples, vocab_size)`` array, which can get very
        large for big corpora.
    """
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1

    image_rows, prefixes, targets = [], [], []
    for img_id, captions in descriptions.items():
        feature = image_features.get(img_id)
        if feature is None:
            # Captioned image without extracted features: skip it instead of
            # aborting the whole build with a KeyError.
            continue
        for seq in tokenizer.texts_to_sequences(captions):
            for i in range(1, len(seq)):
                image_rows.append(feature)
                prefixes.append(seq[:i])
                targets.append(seq[i])

    if not targets:
        return np.array([]), np.array([]), np.array([])

    # Pad and one-hot encode once for the whole batch rather than per sample.
    X1 = np.array(image_rows)
    X2 = pad_sequences(prefixes, maxlen=max_length)
    y = tf.keras.utils.to_categorical(targets, num_classes=vocab_size)
    return X1, X2, y

if not all_captions:
    raise ValueError("No captions were loaded; check the captions file path and format.")

# Measure the longest caption in tokenizer tokens -- the units the padded
# sequences are built from. Counting whitespace-separated words can disagree
# with that when the tokenizer filters or splits on punctuation.
max_length = max(len(seq) for seq in tokenizer.texts_to_sequences(all_captions))

# Split by image, not by individual prefix sample: otherwise prefixes of the
# same caption (and the same image) land in both sets and the validation loss
# is measured on images the model was trained on. The unsplit X1/X2/y arrays
# are therefore no longer materialised.
train_ids, test_ids = train_test_split(sorted(descriptions), test_size=0.2, random_state=42)

X1_train, X2_train, y_train = create_sequences(
    tokenizer, max_length,
    {img_id: descriptions[img_id] for img_id in train_ids},
    image_features,
)
X1_test, X2_test, y_test = create_sequences(
    tokenizer, max_length,
    {img_id: descriptions[img_id] for img_id in test_ids},
    image_features,
)

In [None]:
def build_model(vocab_size, max_length):
    """Assemble and compile the two-input captioning network.

    One branch takes a 4096-dimensional image feature vector, the other a
    padded sequence of ``max_length`` word indices. Both are reduced to 256
    units, summed, and decoded into a softmax over ``vocab_size`` words.

    Args:
        vocab_size: Size of the embedding table and of the output softmax.
        max_length: Length of the padded caption-prefix input.

    Returns:
        The compiled model (categorical cross-entropy loss, Adam optimizer).
    """
    # Image branch: dropout, then a dense projection to 256 units
    image_input = Input(shape=(4096,))
    image_dropped = Dropout(0.5)(image_input)
    image_dense = Dense(256, activation='relu')(image_dropped)

    # Text branch: masked embedding -> dropout -> LSTM
    caption_input = Input(shape=(max_length,))
    caption_embedded = Embedding(vocab_size, 256, mask_zero=True)(caption_input)
    caption_dropped = Dropout(0.5)(caption_embedded)
    caption_state = LSTM(256)(caption_dropped)

    # Merge the branches by element-wise addition and predict the next word
    merged = Add()([image_dense, caption_state])
    hidden = Dense(256, activation='relu')(merged)
    word_probs = Dense(vocab_size, activation='softmax')(hidden)

    captioner = Model(inputs=[image_input, caption_input], outputs=word_probs)
    captioner.compile(optimizer='adam', loss='categorical_crossentropy')
    return captioner

# Build the network for this vocabulary / caption length and print its layers
model = build_model(vocab_size=vocab_size, max_length=max_length)
model.summary()

In [None]:
# Fit the captioning model; the held-out arrays are evaluated after each epoch
epochs, batch_size = 20, 64

history = model.fit(
    x=[X1_train, X2_train],
    y=y_train,
    validation_data=([X1_test, X2_test], y_test),
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
)

In [None]:
# Persist the trained network, then pickle the fitted tokenizer next to it
# so captions can be encoded/decoded with the same word index later.
model.save('image_captioning_model.h5')
with open('tokenizer.pkl', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
def generate_caption(model, tokenizer, image_feature, max_length,
                     start_token='startseq', end_token='endseq'):
    """Greedily decode a caption for one image feature vector.

    Starting from ``start_token``, the text generated so far is re-encoded,
    padded to ``max_length`` and fed to the model together with the image
    feature; the highest-scoring word is appended. Decoding stops after
    ``max_length`` steps, when ``end_token`` is produced, or when the
    predicted index has no word in ``tokenizer.index_word``.

    Args:
        model: Model whose ``predict`` takes ``[image_batch, sequence_batch]``.
        tokenizer: Fitted tokenizer (``word_index``, ``index_word``,
            ``texts_to_sequences``).
        image_feature: Feature vector of a single image (no batch axis).
        max_length: Padded sequence length, also the maximum number of steps.
        start_token: Marker the decoding is seeded with.
        end_token: Marker that ends the decoding.

    Returns:
        The generated text, including the start marker and, if it was
        produced, the end marker.

    Raises:
        ValueError: If ``start_token`` is not in the tokenizer vocabulary.
            The seed would then encode to an empty sequence and the model
            would be decoding from padding only.
    """
    if start_token not in tokenizer.word_index:
        raise ValueError(
            f"Start token {start_token!r} is not in the tokenizer vocabulary; "
            "the training captions must be wrapped in the start/end markers."
        )

    # Loop-invariant: add the batch axis once instead of on every step.
    image_batch = np.expand_dims(image_feature, axis=0)

    in_text = start_token
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([image_batch, sequence], verbose=0)
        # Plain int so the lookup does not rely on NumPy scalar hashing.
        word = tokenizer.index_word.get(int(np.argmax(yhat)))
        if word is None:
            break
        in_text += ' ' + word
        if word == end_token:
            break
    return in_text

# Test on a new image
# Placeholder path: point this at a folder containing the image(s) to caption.
test_features = preprocess_images('path_to_test_image')
if not test_features:
    raise ValueError("No test image features were extracted; check the test image path.")

# Feature keys are file names up to their first '.'. The first one is used
# here; set test_image_id explicitly to caption a specific image.
test_image_id = sorted(test_features)[0]
test_image_feature = test_features[test_image_id]

caption = generate_caption(model, tokenizer, test_image_feature, max_length)
print("Generated Caption:", caption)