In [None]:
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import tensorflow as tf

from pickle import load
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.models import load_model


# ─── Image to caption ────────────────────────────────────────────────────────────
# The path is hard-coded (nothing is prompted for at runtime); edit this
# absolute Windows path to caption a different image.
img_path = "C:\\Users\\abk4t\\AI Project - Image Caption Generator\\Flickr8k_Dataset\\Flicker8k_Dataset\\53043785_c468d6f931.jpg"


def extract_features(filename, model):
    """
    Load an image, preprocess it for Xception, and extract features.

    The image is converted to 3-channel RGB, resized to 299x299, scaled from
    the [0, 255] pixel range to [-1, 1], and passed to ``model.predict`` as a
    batch containing that single image.

    Args:
        filename: Image location, handed unchanged to ``PIL.Image.open``.
        model: Feature extractor exposing ``predict``.

    Returns:
        The output of ``model.predict`` for the one-image batch (a (1, 2048)
        feature vector for the pooled Xception model this script builds), or
        None if the image could not be opened or decoded.
    """
    try:
        # The context manager releases the file handle. convert() forces the
        # pixel data to load and returns an independent image, so the result
        # stays valid after the file is closed. Converting to RGB also
        # normalises grayscale, palette and RGBA inputs to exactly 3 channels
        # (previously only the 4-channel case was handled).
        with Image.open(filename) as raw_image:
            image = raw_image.convert('RGB')
    except (OSError, ValueError):
        # OSError covers missing, unreadable, unidentified and truncated
        # files. A bare ``except`` would also trap KeyboardInterrupt and
        # SystemExit.
        print("ERROR: Couldn't open image! Make sure the image path and extension are correct")
        return None

    image = image.resize((299, 299))

    # float32 is Keras' default float type; scaling in float16 would round
    # the normalised pixel values before the model ever sees them.
    pixels = np.asarray(image, dtype='float32')
    batch = np.expand_dims(pixels, axis=0)

    # Map [0, 255] -> [-1, 1].
    batch = batch / 127.5 - 1.0

    # Run Xception on whichever device is available (GPU/DirectML or CPU)
    return model.predict(batch)


def word_for_id(integer, tokenizer):
    """
    Reverse-lookup a word from its integer index in ``tokenizer.word_index``.

    The mapping is scanned in iteration order and the first word whose index
    equals ``integer`` is returned; None is returned when nothing matches.
    """
    matching_words = (
        token
        for token, idx in tokenizer.word_index.items()
        if idx == integer
    )
    return next(matching_words, None)


def generate_desc(model, tokenizer, photo, max_length):
    """
    Greedily decode a caption for ``photo`` with the trained captioning model.

    Decoding starts from the word 'start'. On each of at most ``max_length``
    steps the caption so far is tokenized and padded to ``max_length``, the
    model is queried with ``[photo, padded_sequence]``, and the word with the
    highest score is appended. Decoding stops early when the predicted index
    has no word in the tokenizer or when the word 'end' has been appended.

    Every prediction runs inside ``tf.device('/CPU:0')`` so the LSTM does not
    attempt a cuDNN kernel.

    Returns the caption as a space-separated string, including the leading
    'start' and, if it was produced, the trailing 'end'.
    """
    caption_words = ['start']
    steps_left = max_length

    while steps_left > 0:
        steps_left -= 1

        caption_so_far = ' '.join(caption_words)
        token_ids = tokenizer.texts_to_sequences([caption_so_far])[0]
        padded_ids = pad_sequences([token_ids], maxlen=max_length)

        # Keep the captioning model's predict() on the CPU
        with tf.device('/CPU:0'):
            scores = model.predict([photo, padded_ids], verbose=0)

        next_word = word_for_id(np.argmax(scores), tokenizer)
        if next_word is None:
            # Predicted index is not in the vocabulary: stop decoding
            break

        caption_words.append(next_word)
        if next_word == 'end':
            break

    return ' '.join(caption_words)


# ─── Load tokenizer, captioning model, and Xception feature extractor ─────────
# Length every token sequence is padded to before it is fed to the caption
# model (presumably the value used at training time — TODO confirm).
max_length = 32

# NOTE(review): unpickling can execute arbitrary code — only load a
# tokenizer.p that comes from a trusted source.
# The context manager closes the file handle once the tokenizer is loaded.
with open("tokenizer.p", "rb") as tokenizer_file:
    tokenizer = load(tokenizer_file)
caption_model = load_model('models/best_model_9.h5')

# Xception can run on GPU/DirectML if available; no cuDNN involved here
xception_model = Xception(include_top=False, pooling="avg")


# ─── Extract features from the input image ──────────────────────────────────────
photo = extract_features(img_path, xception_model)
if photo is None:
    # extract_features has already printed the reason; abort the run here.
    raise SystemExit("Failed to extract features from “%s”" % img_path)

# ─── Open the image for display ─────────────────────────────────────────────────
img = Image.open(img_path)

# ─── Generate and print the description ─────────────────────────────────────────
description = generate_desc(caption_model, tokenizer, photo, max_length)
print("\n\n" + description)

# ─── Display the image ─────────────────────────────────────────────────────────
plt.imshow(img)
plt.axis('off')
# Explicit show() so the figure also appears when this runs as a plain script,
# not only under a notebook's inline backend.
plt.show()
