In [2]:
from tensorflow.keras.models import load_model

In [3]:
import numpy as np
import pandas as pd
import os
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.densenet import DenseNet201
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
import pickle

# Load the model
caption_model = load_model(r"C:\Users\08bur\OneDrive\Desktop\submission-2\saved_model.keras")

# Load the tokenizer
with open(r"C:\Users\08bur\OneDrive\Desktop\submission-2\tokenizer_2.pkl", 'rb') as f:
    tokenizer = pickle.load(f)

max_length = 35  # Define your max_length here

# Function to extract image features using DenseNet
def extract_image_features(image_path, model):
    img = load_img(image_path, target_size=(224, 224))  # Resize image to match model input
    img = img_to_array(img) / 255.0  # Normalize the image
    img = np.expand_dims(img, axis=0)  # Expand dimensions to fit model input
    feature = model.predict(img, verbose=0)  # Get features from the model
    return feature

# Function to convert integer back to word using the tokenizer
def idx_to_word(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# Function to predict caption for a given image
def predict_caption(image_path):
    # Load the DenseNet model for feature extraction
    densenet_model = DenseNet201()
    feature_extraction_model = Model(inputs=densenet_model.input, outputs=densenet_model.layers[-2].output)
    
    # Extract features from the image
    feature = extract_image_features(image_path, feature_extraction_model)

    # Initialize the caption generation
    in_text = "startseq"
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)  # Note: changed to 'maxlen'

        # Predict the next word
        y_pred = caption_model.predict([feature, sequence])
        y_pred = np.argmax(y_pred)

        # Convert the predicted integer to a word
        word = idx_to_word(y_pred, tokenizer)

        if word is None:
            break
            
        in_text += " " + word
        
        if word == 'endseq':
            break
            
    return in_text.replace("startseq", "").replace("endseq", "").strip()  # Clean the output caption

# Example usage
image_path = r'C:\Users\08bur\OneDrive\Desktop\Prof Jim 2\input\flickr8k\Images\130211457_be3f6b335d.jpg'  # Replace with your image path
predicted_caption = predict_caption(image_path)
print(predicted_caption)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 281ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
happy woman in red dress is standing in front of the camera
