In [1]:
import os
import numpy as np
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.applications import InceptionV3
from sklearn.model_selection import train_test_split
import pickle
import string

In [2]:
# Locations of the Flickr8k image directory and its captions file.
# NOTE(review): these are absolute, machine-specific Windows paths; anyone
# running this notebook elsewhere has to edit them before the first run.
image_dir = r"C:\Users\ahmed\Documents\Python Scripts\Image caption generator\Flickr8k_Dataset\Images"
caption_file = r"C:\Users\ahmed\Documents\Python Scripts\Image caption generator\Flickr8k_Dataset\captions.txt"

# Load and process captions
def load_captions(filename):
    """Read an ``image,caption`` file into a dict of cleaned captions.

    The first line of the file is treated as a header and skipped. Every
    remaining line is split on its FIRST comma only, so commas that appear
    inside the caption text do not truncate the caption. Captions are
    lower-cased, stripped of ASCII punctuation and wrapped in the
    ``startseq`` / ``endseq`` markers.

    Args:
        filename: Path to the captions file.

    Returns:
        dict mapping the image identifier (text before the first comma) to a
        list of strings of the form ``"startseq <caption> endseq"``.
    """
    # Build the translation table once instead of once per line.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    descriptions = {}
    with open(filename, 'r', encoding='utf-8') as file:
        captions = file.readlines()
    for line in captions[1:]:  # Skip header row
        # maxsplit=1: everything after the first comma belongs to the caption.
        tokens = line.strip().split(",", 1)
        if len(tokens) < 2:
            continue  # No comma on this line -> nothing usable
        image_id, caption = tokens[0], tokens[1].lower()
        caption = caption.translate(strip_punctuation)  # Remove punctuation
        descriptions.setdefault(image_id, []).append("startseq " + caption + " endseq")
    return descriptions

# Parse the captions file into {image id: [wrapped caption, ...]}.
descriptions = load_captions(caption_file)

In [3]:
# Feature extraction using InceptionV3
def preprocess_image(image_path):
    """Load one image file and prepare it as a single-item batch for InceptionV3.

    The image is resized to 299x299 on load, converted to an array, given a
    leading batch axis, and passed through the InceptionV3 ``preprocess_input``
    helper.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The preprocessed array with a leading axis of length 1.
    """
    pil_image = load_img(image_path, target_size=(299, 299))
    # Prepend the batch axis expected by ``predict``.
    batch = img_to_array(pil_image)[np.newaxis, ...]
    return tf.keras.applications.inception_v3.preprocess_input(batch)

# Load InceptionV3 with ImageNet weights, then re-wire it so the model's
# output is the second-to-last layer's output rather than the final layer's.
model_inception = InceptionV3(weights='imagenet')
model_inception = Model(model_inception.input, model_inception.layers[-2].output)

# Updated function to extract features with checkpoints
def extract_features(directory, batch_size=50, checkpoint_file='features_checkpoint.pkl'):
    """Run every image file in ``directory`` through ``model_inception``, resumably.

    Results are collected in a dict keyed by file name. If ``checkpoint_file``
    already exists it is loaded first and any file name already present in it
    is skipped, so an interrupted run can be continued by calling the function
    again with the same arguments.

    Args:
        directory: Folder containing the image files.
        batch_size: A checkpoint is written whenever the 1-based position of a
            newly processed image in the directory listing is a multiple of
            this value.
        checkpoint_file: Pickle file used both to resume and to store results.

    Returns:
        dict mapping file name to the array returned by
        ``model_inception.predict`` for that image.
    """
    def _save_checkpoint(data):
        # Write to a temporary file and swap it in, so an interruption during
        # the dump cannot leave a truncated checkpoint behind.
        tmp_file = checkpoint_file + '.tmp'
        with open(tmp_file, 'wb') as file:
            pickle.dump(data, file)
        os.replace(tmp_file, checkpoint_file)

    # Try loading a checkpoint if it exists
    if os.path.exists(checkpoint_file):
        # NOTE(review): unpickling can execute arbitrary code; only point this
        # at checkpoint files produced by this notebook.
        with open(checkpoint_file, 'rb') as file:
            features = pickle.load(file)
    else:
        features = {}

    # Only regular files are candidates; sub-directories would make the image
    # loader fail.
    img_names = [
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]
    total_images = len(img_names)

    for idx, img_name in enumerate(img_names):
        if img_name in features:
            continue  # Skip if already processed

        img_path = os.path.join(directory, img_name)
        img = preprocess_image(img_path)
        features[img_name] = model_inception.predict(img, verbose=0)

        # Save checkpoint every `batch_size` images
        if (idx + 1) % batch_size == 0:
            _save_checkpoint(features)
            print(f"Checkpoint saved at {idx + 1} / {total_images} images processed.")

    # Final save after all images
    _save_checkpoint(features)
    print("Feature extraction complete and saved.")

    return features

# Run feature extraction over the image directory (resumes from the default
# checkpoint file if one is present in the working directory).
features = extract_features(image_dir)

Checkpoint saved at 200 / 8091 images processed.
Checkpoint saved at 250 / 8091 images processed.
Checkpoint saved at 300 / 8091 images processed.
Checkpoint saved at 350 / 8091 images processed.
Checkpoint saved at 400 / 8091 images processed.
Checkpoint saved at 450 / 8091 images processed.
Checkpoint saved at 500 / 8091 images processed.
Checkpoint saved at 550 / 8091 images processed.
Checkpoint saved at 600 / 8091 images processed.
Checkpoint saved at 650 / 8091 images processed.
Checkpoint saved at 700 / 8091 images processed.
Checkpoint saved at 750 / 8091 images processed.
Checkpoint saved at 800 / 8091 images processed.
Checkpoint saved at 850 / 8091 images processed.
Checkpoint saved at 900 / 8091 images processed.
Checkpoint saved at 950 / 8091 images processed.
Checkpoint saved at 1000 / 8091 images processed.
Checkpoint saved at 1050 / 8091 images processed.
Checkpoint saved at 1100 / 8091 images processed.
Checkpoint saved at 1150 / 8091 images processed.
Checkpoint saved

In [5]:
# Prepare data for training
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Combine all captions into a single list for tokenizer fitting
def combine_captions(descriptions):
    """Flatten a ``{key: [caption, ...]}`` mapping into one list of captions.

    Captions keep the mapping's iteration order, and within each key the
    order of that key's list.

    Args:
        descriptions: Mapping whose values are iterables of caption strings.

    Returns:
        A new list holding every caption from every value.
    """
    return [
        caption
        for captions_list in descriptions.values()
        for caption in captions_list
    ]

# Step 1: Fit a tokenizer on every caption (all images pooled together)
all_captions = combine_captions(descriptions)  # Combine captions from the loaded descriptions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

# Step 2: Maximum sequence length = word count of the longest caption,
# counted by whitespace splitting (the startseq/endseq markers are included).
max_length = max(len(caption.split()) for caption in all_captions)

# Step 3: Create sequences
def create_sequences(tokenizer, max_length, descriptions, features):
    """Expand every caption into (image feature, caption prefix, next word) samples.

    For a caption encoded as token ids ``t0 .. tn``, one sample is produced for
    each split point ``i`` in ``1 .. n``: the prefix ``t0 .. t(i-1)`` padded to
    ``max_length`` and the target ``ti`` one-hot encoded over the vocabulary.

    Args:
        tokenizer: Fitted tokenizer providing ``word_index`` and
            ``texts_to_sequences``.
        max_length: Length every prefix is padded to.
        descriptions: Mapping of image key to list of caption strings.
        features: Mapping of image key to an indexable whose element ``[0]``
            is used as that image's feature vector.

    Returns:
        Tuple ``(X1, X2, y)`` of arrays: image features, padded prefixes and
        one-hot targets, one row per sample.

    Raises:
        KeyError: If a key of ``descriptions`` is missing from ``features``.
    """
    # +1 because index 0 is reserved for padding.
    n_classes = len(tokenizer.word_index) + 1
    image_inputs, prefix_inputs, targets = [], [], []

    for image_key, image_captions in descriptions.items():
        image_vector = features[image_key][0]
        for text in image_captions:
            token_ids = tokenizer.texts_to_sequences([text])[0]
            for split_at in range(1, len(token_ids)):
                padded_prefix = pad_sequences([token_ids[:split_at]], maxlen=max_length)[0]
                one_hot_next = tf.keras.utils.to_categorical(
                    [token_ids[split_at]], num_classes=n_classes
                )[0]
                image_inputs.append(image_vector)
                prefix_inputs.append(padded_prefix)
                targets.append(one_hot_next)

    return np.array(image_inputs), np.array(prefix_inputs), np.array(targets)

# Build the training arrays: image features (X1), padded caption prefixes (X2)
# and one-hot next-word targets (y).
X1, X2, y = create_sequences(tokenizer, max_length, descriptions, features)

In [8]:
# Vocabulary size (+1 for the padding index 0) and the word count of the
# longest caption. max_length repeats the computation already done in the
# data-preparation cell, so this cell also works if only `all_captions` and
# `tokenizer` are available.
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(caption.split()) for caption in all_captions)

# Build the model
from tensorflow.keras.layers import Input, Dropout, Dense, Embedding, LSTM, Add
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

def define_model(vocab_size, max_length):
    """Build the two-input captioning network (uncompiled).

    One branch takes a 2048-wide image feature vector, the other a padded
    token-id sequence of length ``max_length``. Each branch is reduced to a
    256-wide vector; the two are summed and decoded into a softmax over the
    vocabulary.

    Args:
        vocab_size: Number of output classes and size of the embedding table.
        max_length: Length of the token-id input sequence.

    Returns:
        A Keras ``Model`` with inputs ``[image_features, token_ids]`` and a
        ``vocab_size``-wide softmax output.
    """
    # Image branch: dropout, then project to 256 units.
    image_input = Input(shape=(2048,))
    image_dropped = Dropout(0.5)(image_input)
    image_vector = Dense(256, activation='relu')(image_dropped)

    # Text branch: embed (index 0 is masked as padding), dropout, then LSTM.
    text_input = Input(shape=(max_length,))
    text_embedded = Embedding(vocab_size, 256, mask_zero=True)(text_input)
    text_dropped = Dropout(0.5)(text_embedded)
    text_vector = LSTM(256)(text_dropped)

    # Decoder: element-wise sum of both branches, hidden layer, softmax.
    merged = Add()([image_vector, text_vector])
    hidden = Dense(256, activation='relu')(merged)
    next_word_probs = Dense(vocab_size, activation='softmax')(hidden)

    return Model(inputs=[image_input, text_input], outputs=next_word_probs)

# Build the model from the vocab_size and max_length computed above, then
# compile it. categorical_crossentropy expects one-hot targets.
model = define_model(vocab_size, max_length)
model.compile(loss='categorical_crossentropy', optimizer=Adam())


In [None]:
# Train the model
epochs = 20
batch_size = 64
# Pass batch_size explicitly. Previously it was defined but never handed to
# fit(), so Keras used its default batch size while steps_per_epoch had been
# computed for batches of 64. Letting Keras derive the number of steps from
# the array length and batch_size also keeps the final partial batch.
model.fit([X1, X2], y, epochs=epochs, batch_size=batch_size, verbose=1)

In [None]:
# Save the trained model and the fitted tokenizer to the working directory.
# NOTE(review): the training max_length is not stored alongside these files;
# inference code must use the same value that was used for training.
model.save("image_caption_model.h5")
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [None]:
# Caption generation
def generate_caption(model, tokenizer, photo, max_length):
    """Greedily decode a caption for one image feature.

    Starting from ``startseq``, the text so far is re-encoded and padded on
    every step, the model is asked for the next-word distribution, and the
    highest-scoring word is appended. Decoding stops after ``max_length``
    steps, when the predicted index has no word, or once ``endseq`` has been
    appended.

    Args:
        model: Model whose ``predict`` accepts ``[photo, padded_sequence]``.
        tokenizer: Tokenizer providing ``texts_to_sequences`` and ``index_word``.
        photo: Image feature batch passed to the model unchanged.
        max_length: Padding length and maximum number of decoding steps.

    Returns:
        The generated text with the ``startseq`` / ``endseq`` markers removed
        and surrounding whitespace stripped.
    """
    words = ['startseq']
    for _ in range(max_length):
        token_ids = tokenizer.texts_to_sequences([' '.join(words)])[0]
        padded = pad_sequences([token_ids], maxlen=max_length)
        best_index = np.argmax(model.predict([photo, padded], verbose=0))
        next_word = tokenizer.index_word.get(best_index)
        if next_word is None:
            break  # Predicted index has no vocabulary entry
        words.append(next_word)
        if next_word == 'endseq':
            break
    caption = ' '.join(words)
    return caption.replace('startseq', '').replace('endseq', '').strip()

In [None]:
# Load model and tokenizer for prediction
def load_model_and_predict(image_path, model_path="image_caption_model.h5",
                           tokenizer_path="tokenizer.pkl", max_length=None):
    """Load the saved captioning model and tokenizer and caption one image.

    Args:
        image_path: Path of the image to caption.
        model_path: Saved Keras model file to load.
        tokenizer_path: Pickled tokenizer file to load.
        max_length: Sequence length used during training. When ``None`` it is
            read from the model's second input (the padded token sequence);
            if that dimension is unavailable, 34 is used as before.

    Returns:
        The generated caption string.
    """
    model = load_model(model_path)
    # NOTE(review): unpickling can execute arbitrary code; only load tokenizer
    # files produced by this notebook.
    with open(tokenizer_path, "rb") as f:
        tokenizer = pickle.load(f)

    if max_length is None:
        max_length = 34  # Historical default, kept as a fallback
        # Prefer the length the model was actually built with, so inference
        # cannot drift from training.
        if len(model.inputs) > 1 and model.inputs[1].shape[1] is not None:
            max_length = int(model.inputs[1].shape[1])

    photo = preprocess_image(image_path)
    feature = model_inception.predict(photo, verbose=0)
    caption = generate_caption(model, tokenizer, feature, max_length)
    return caption

In [None]:
# Example usage: Generate caption for a new image.
# The path below is a placeholder; replace it with a real image file before
# running this cell.
image_path = r"C:\path\to\your\external\image.jpg"
print("Generated Caption:", load_model_and_predict(image_path))