# In [None]:
import joblib
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Embedding, LSTM, Dot, Reshape, Concatenate, BatchNormalization, GlobalMaxPooling2D, Dropout, Add, MaxPooling2D, GRU, AveragePooling2D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import cv2
from nltk.translate.bleu_score import sentence_bleu

# Path of the CheXNet weights file; used as the default `chexnet_weights`
# argument of create_chexnet, which passes it to Model.load_weights.
chexnet_weights = "brucechou1983_CheXNet_Keras_0.3.0_weights.h5"

# Model Architecture
def create_chexnet(chexnet_weights=chexnet_weights, input_size=(224,224)):
    """Build a DenseNet121 feature extractor initialised from CheXNet weights.

    A global-average-pooling layer and a 14-unit sigmoid head are attached
    only so that the weights file can be loaded; the returned model is then
    cut back to the layer that sits three positions from the end, i.e. the
    last layer before the pooling + classification head that were appended.

    Args:
        chexnet_weights: Path of the weights file handed to ``load_weights``.
        input_size: ``(height, width)`` of the input images; a channel
            dimension of 3 is appended.

    Returns:
        A ``tf.keras.Model`` mapping an image batch to the backbone's
        feature map (no pooling, no classification head).
    """
    # weights=None: the backbone is restored from the CheXNet file right
    # below, so fetching the default ImageNet weights first would only add
    # a needless download (and a network dependency).
    backbone = tf.keras.applications.DenseNet121(
        include_top=False, weights=None, input_shape=input_size + (3,)
    )
    head = GlobalAveragePooling2D()(backbone.output)
    head = Dense(14, activation="sigmoid", name="chexnet_output")(head)
    chexnet = tf.keras.Model(inputs=backbone.input, outputs=head)
    chexnet.load_weights(chexnet_weights)
    # layers[-1] is the Dense head, layers[-2] the pooling layer, so
    # layers[-3] is the final backbone layer.
    chexnet = tf.keras.Model(inputs=backbone.input, outputs=chexnet.layers[-3].output)
    return chexnet

class ImageEncoder(tf.keras.layers.Layer):
    """Frozen CheXNet backbone followed by 2x2 average pooling.

    The pooled feature map is flattened over its two spatial axes so the
    result has one "location" axis: ``(batch, rows * cols, channels)``.
    """

    def __init__(self, name="image_encoder_block"):
        # NOTE(review): `name` is accepted but not forwarded to the base
        # class, so the layer keeps its auto-generated name.
        # greedy_search_predict looks this layer up as 'image_encoder',
        # presumably that auto-generated name -- verify before forwarding
        # `name` here.
        super().__init__()
        self.chexnet = create_chexnet(input_size=(224, 224))
        self.chexnet.trainable = False  # backbone is used as a fixed extractor
        self.avgpool = AveragePooling2D(pool_size=(2, 2))

    def call(self, data):
        """Return pooled backbone features with the spatial axes merged."""
        pooled = self.avgpool(self.chexnet(data))
        dims = pooled.shape
        return tf.reshape(pooled, shape=(-1, dims[1] * dims[2], dims[3]))

def encoder(image1, image2, dense_dim, dropout_rate):
    """Encode an image pair into one normalised feature sequence.

    Both images go through the same ImageEncoder and the same ``bkdense``
    projection (weights are shared), the two results are concatenated along
    axis 1, then batch-normalised and passed through dropout.

    Args:
        image1: First image tensor / Keras input.
        image2: Second image tensor / Keras input.
        dense_dim: Number of units of the shared projection layer.
        dropout_rate: Rate of the final dropout layer.

    Returns:
        The output tensor of the ``encoder_dropout`` layer.
    """
    im_encoder = ImageEncoder()
    backbone_feats = [im_encoder(img) for img in (image1, image2)]
    bk_dense = Dense(dense_dim, name='bkdense', activation='relu')
    projected = [bk_dense(feat) for feat in backbone_feats]
    # Left unnamed on purpose of compatibility: greedy_search_predict
    # fetches this layer as 'concatenate'.
    merged = Concatenate(axis=1)(projected)
    normalised = BatchNormalization(name="encoder_batch_norm")(merged)
    return Dropout(dropout_rate, name="encoder_dropout")(normalised)

class GlobalAttention(tf.keras.layers.Layer):
    """Additive attention over the encoder output.

    Scores are ``V(tanh(W1(encoder_output) + W2(decoder_h)))`` and are
    soft-maxed along axis 1 of the encoder output.
    """

    def __init__(self, dense_dim):
        super().__init__()
        self.W1 = Dense(units=dense_dim)  # projects the encoder output
        self.W2 = Dense(units=dense_dim)  # projects the decoder state
        self.V = Dense(units=1)           # reduces each position to one score

    def call(self, encoder_output, decoder_h):
        """Return ``(context_vector, attention_weights)`` for one step."""
        # Insert an axis at position 1 so the state broadcasts against
        # every position of the encoder output.
        state = tf.expand_dims(decoder_h, axis=1)
        energy = tf.nn.tanh(self.W1(encoder_output) + self.W2(state))
        attention_weights = tf.nn.softmax(self.V(energy), axis=1)
        weighted = attention_weights * encoder_output
        context_vector = tf.reduce_sum(weighted, axis=1)
        return context_vector, attention_weights

class OneStepDecoder(tf.keras.layers.Layer):
    """Single decoding step: embed token, attend, run the GRU, predict.

    Notes:
        * ``name`` is accepted but not forwarded to the base class.
        * ``self.LSTM`` is a GRU layer despite the attribute name.
        * ``self.dense`` is created here but not used by ``call``.
    """

    def __init__(self, vocab_size, embedding_dim, max_pad, dense_dim, name="onestepdecoder"):
        super().__init__()
        self.dense_dim = dense_dim
        self.embedding = Embedding(
            input_dim=vocab_size+1,
            output_dim=embedding_dim,
            input_length=max_pad,
            mask_zero=True,
            name='onestepdecoder_embedding',
        )
        self.LSTM = GRU(units=self.dense_dim, return_state=True, name='onestepdecoder_LSTM')
        self.attention = GlobalAttention(dense_dim=dense_dim)
        self.dense = Dense(dense_dim, name='onestepdecoder_embedding_dense', activation='relu')
        self.final = Dense(vocab_size+1, activation='softmax')

    @tf.function
    def call(self, input_to_decoder, encoder_output, decoder_h):
        """Run one step.

        Args:
            input_to_decoder: Token ids for the current step.
            encoder_output: Encoder feature sequence to attend over.
            decoder_h: Previous decoder state (also the attention query).

        Returns:
            ``(softmax output of the final Dense layer, new GRU state,
            attention weights)``.
        """
        token_embedding = self.embedding(input_to_decoder)
        context_vector, attention_weights = self.attention(encoder_output, decoder_h)
        # Add an axis at position 1 so the context lines up with the
        # embedding for concatenation on the last axis.
        context_step = tf.expand_dims(context_vector, axis=1)
        gru_input = Concatenate(axis=-1)([context_step, token_embedding])
        gru_output, next_state = self.LSTM(gru_input, initial_state=decoder_h)
        return self.final(gru_output), next_state, attention_weights

class Decoder(tf.keras.Model):
    """Runs OneStepDecoder for `max_pad` steps over a given caption.

    At every step the decoder is fed the token taken from the supplied
    `caption` (not its own previous prediction), together with the encoder
    output and the state returned by the previous step.
    """
    def __init__(self, max_pad, embedding_dim, dense_dim, batch_size, vocab_size):
        """Create the one-step decoder and remember the loop length.

        Args:
            max_pad: Number of decoding steps executed by `call`.
            embedding_dim: Forwarded to OneStepDecoder.
            dense_dim: Forwarded to OneStepDecoder.
            batch_size: Stored on the instance; not read anywhere in this class.
            vocab_size: Forwarded to OneStepDecoder.
        """
        super().__init__()
        self.onestepdecoder = OneStepDecoder(vocab_size=vocab_size, embedding_dim=embedding_dim, max_pad=max_pad, dense_dim=dense_dim)
        self.max_pad = max_pad
        self.batch_size = batch_size
        self.dense_dim = dense_dim

    @tf.function
    def call(self, encoder_output, caption):
        """Return the per-step decoder outputs for `caption`.

        Args:
            encoder_output: Encoder feature sequence.
            caption: Token ids; column `t` is the decoder input at step `t`.

        Returns:
            The stacked step outputs with the first two axes swapped
            (presumably (batch, max_pad, vocab_size + 1) -- TODO confirm).
        """
        # Initial state: zeros shaped like the first position of the encoder output.
        decoder_h = tf.zeros_like(encoder_output[:,0])
        output_array = tf.TensorArray(tf.float32, size=self.max_pad)
        for timestep in range(self.max_pad):
            # Slice keeps axis 1 (length 1); the attention weights are discarded here.
            output, decoder_h, _ = self.onestepdecoder(caption[:,timestep:timestep+1], encoder_output, decoder_h)
            output_array = output_array.write(timestep, output)
        # stack() puts the step index on axis 0; move it to axis 1.
        output_array = tf.transpose(output_array.stack(), [1, 0, 2])
        return output_array

# Model Creation and Prediction Functions
def create_model(tokenizer_path='tokenizer.pkl', weights_path='Encoder_Decoder_global_attention.h5'):
    """Build the encoder-decoder captioning model and load its weights.

    Args:
        tokenizer_path: Path of the joblib-serialised tokenizer. Defaults to
            the file name that was previously hard-coded.
        weights_path: Path of the trained model weights. Defaults to the
            file name that was previously hard-coded.

    Returns:
        ``(model, tokenizer)`` where ``model`` takes
        ``[image1, image2, caption]`` and returns the Decoder output.
    """
    input_size = (224, 224)
    # SECURITY: joblib.load unpickles the file and can execute arbitrary
    # code; only point this at tokenizer files from a trusted source.
    tokenizer = joblib.load(tokenizer_path)
    max_pad = 29
    batch_size = 100
    vocab_size = len(tokenizer.word_index)
    embedding_dim = 300
    dense_dim = 512
    dropout_rate = 0.2

    # Reset Keras' global state before any layer is created:
    # greedy_search_predict fetches layers by names such as 'concatenate'
    # and 'decoder', which are not set explicitly anywhere in this file.
    tf.keras.backend.clear_session()
    image1 = Input(shape=input_size + (3,))
    image2 = Input(shape=input_size + (3,))
    caption = Input(shape=(max_pad,))

    encoder_output = encoder(image1, image2, dense_dim, dropout_rate)
    output = Decoder(max_pad, embedding_dim, dense_dim, batch_size, vocab_size)(encoder_output, caption)
    model = tf.keras.Model(inputs=[image1, image2, caption], outputs=output)
    model.load_weights(weights_path)
    return model, tokenizer

def greedy_search_predict(image1, image2, model, tokenizer, input_size=(224,224), max_pad=29):
    """Generate a caption for an image pair by greedy decoding.

    The images are resized, pushed through the encoder layers fetched from
    ``model`` by name, and the one-step decoder is then run repeatedly,
    always feeding back the highest-probability token, until ``<end>`` is
    produced or ``max_pad`` steps have been taken.

    Args:
        image1: First image as an array accepted by ``cv2.resize``. The
            model inputs built by create_model have 3 channels.
        image2: Second image, same requirements as ``image1``.
        model: Model returned by create_model.
        tokenizer: Tokenizer providing ``texts_to_sequences`` and
            ``sequences_to_texts``.
        input_size: Target size handed to ``cv2.resize``.
        max_pad: Maximum number of decoding steps (previously hard-coded
            to 29 inside the function).

    Returns:
        The decoded caption as a string, without the ``<end>`` token.
    """
    # Resolve every layer once instead of on each use.
    image_encoder = model.get_layer('image_encoder')
    bk_dense = model.get_layer('bkdense')
    one_step_decoder = model.get_layer('decoder').onestepdecoder

    image1 = tf.expand_dims(cv2.resize(image1, input_size, interpolation=cv2.INTER_NEAREST), axis=0)
    image2 = tf.expand_dims(cv2.resize(image2, input_size, interpolation=cv2.INTER_NEAREST), axis=0)
    image1 = image_encoder(image1)
    image2 = image_encoder(image2)
    image1 = bk_dense(image1)
    image2 = bk_dense(image2)
    concat = model.get_layer('concatenate')([image1, image2])
    enc_op = model.get_layer('encoder_batch_norm')(concat)
    enc_op = model.get_layer('encoder_dropout')(enc_op)

    # Loop invariants: the id of the end marker and the start-token input
    # used for the first step.
    end_token = np.squeeze(tokenizer.texts_to_sequences(['<end>']))
    caption = np.array(tokenizer.texts_to_sequences(['<cls>']))

    decoder_h = tf.zeros_like(enc_op[:,0])
    token_ids = []
    for _step in range(max_pad):
        output, decoder_h, _ = one_step_decoder(caption, enc_op, decoder_h)
        max_prob = tf.argmax(output, axis=-1)
        if max_prob == end_token:
            break
        token_ids.append(tf.squeeze(max_prob).numpy())
        # The chosen token becomes the decoder input of the next step.
        caption = np.array([max_prob])
    return tokenizer.sequences_to_texts([token_ids])[0]

def get_bleu(reference, prediction):
    """Score a predicted caption against one reference with BLEU-1..4.

    Both strings are split on whitespace; the reference is wrapped in a
    one-element list because ``sentence_bleu`` takes a list of references.

    Args:
        reference: Ground-truth caption.
        prediction: Generated caption.

    Returns:
        Tuple ``(bleu1, bleu2, bleu3, bleu4)``.
    """
    references = [reference.split()]
    hypothesis = prediction.split()
    # One weight vector per score, in output order.
    ngram_weights = (
        (1, 0, 0, 0),
        (0.5, 0.5, 0, 0),
        (0.33, 0.33, 0.33, 0),
        (0.25, 0.25, 0.25, 0.25),
    )
    return tuple(
        sentence_bleu(references, hypothesis, weights=weights)
        for weights in ngram_weights
    )

# Utility Functions
def predict1(image1, image2=None, model_tokenizer=None):
    """Return the greedy-decoded caption for one or two images.

    Args:
        image1: First image array.
        image2: Optional second image; ``image1`` is reused when omitted.
        model_tokenizer: Optional ``(model, tokenizer)`` pair. When omitted
            a fresh pair is built with create_model on every call.

    Returns:
        The caption string produced by greedy_search_predict.
    """
    second_image = image1 if image2 is None else image2
    model, tokenizer = create_model() if model_tokenizer is None else model_tokenizer
    return greedy_search_predict(image1, second_image, model, tokenizer)

def predict2(true_caption, image1, image2=None, model_tokenizer=None):
    """Caption an image pair and score the caption against the truth.

    Args:
        true_caption: Reference caption used for BLEU scoring.
        image1: First image array.
        image2: Optional second image; ``image1`` is reused when omitted.
        model_tokenizer: Optional ``(model, tokenizer)`` pair. When omitted
            a fresh pair is built with create_model on every call.

    Returns:
        A one-row DataFrame with columns bleu1..bleu4.
    """
    second_image = image1 if image2 is None else image2
    model, tokenizer = create_model() if model_tokenizer is None else model_tokenizer
    predicted_caption = greedy_search_predict(image1, second_image, model, tokenizer)
    bleu_row = get_bleu(true_caption, predicted_caption)
    return pd.DataFrame([bleu_row], columns=['bleu1','bleu2','bleu3','bleu4'])

def function1(image1_list, image2_list, model_tokenizer=None):
    """Caption every image pair given as two parallel lists of file paths.

    Args:
        image1_list: Paths of the first image of each pair.
        image2_list: Paths of the second image of each pair; pairs are
            formed with ``zip``, so extra entries in the longer list are
            ignored.
        model_tokenizer: Optional ``(model, tokenizer)`` pair; built with
            create_model when omitted.

    Returns:
        One caption per pair, in order. A pair whose images cannot be read
        yields an empty string after an error message is printed.
    """
    if model_tokenizer is None:
        model_tokenizer = list(create_model())

    predicted_captions = []
    for i1, i2 in zip(image1_list, image2_list):
        first = cv2.imread(i1, cv2.IMREAD_UNCHANGED)
        second = cv2.imread(i2, cv2.IMREAD_UNCHANGED)
        if first is None or second is None:
            print(f"Error loading images: {i1}, {i2}")
            predicted_captions.append("")
            continue
        # Scale pixel values by 1/255 before prediction.
        predicted_captions.append(predict1(first / 255.0, second / 255.0, model_tokenizer))
    return predicted_captions

def function2(true_captions, image1_list, image2_list, model_tokenizer=None):
    """Compute BLEU scores for every (caption, image pair) triple.

    Args:
        true_captions: Reference captions.
        image1_list: Paths of the first image of each pair.
        image2_list: Paths of the second image of each pair. The three
            sequences are combined with ``zip``, so iteration stops at the
            shortest one.
        model_tokenizer: Optional ``(model, tokenizer)`` pair, mirroring
            function1. When omitted the model is built with create_model,
            which is what this function always did before.

    Returns:
        DataFrame with columns bleu1..bleu4 and one row per pair that could
        be loaded. Pairs whose images cannot be read are reported with a
        printed message and contribute no row.
    """
    if model_tokenizer is None:
        model_tokenizer = list(create_model())

    columns = ['bleu1','bleu2','bleu3','bleu4']
    score_frames = []
    for c, i1, i2 in zip(true_captions, image1_list, image2_list):
        img1 = cv2.imread(i1, cv2.IMREAD_UNCHANGED)
        img2 = cv2.imread(i2, cv2.IMREAD_UNCHANGED)
        if img1 is None or img2 is None:
            print(f"Error loading images: {i1}, {i2}")
            continue
        score_frames.append(predict2(c, img1 / 255.0, img2 / 255.0, model_tokenizer))

    # Concatenate once: growing a DataFrame inside the loop copies all
    # previous rows on every iteration and starts from an empty frame.
    if not score_frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(score_frames, ignore_index=True)

# Main Execution
if __name__ == "__main__":
    # Demo: caption one image pair, then a list of pairs, then score a
    # list of pairs against reference captions.
    print("Loading model and tokenizer...")
    model, tokenizer = create_model()
    model_tokenizer = [model, tokenizer]

    image1_path, image2_path = 'image1.png', 'image2.png'
    image1 = cv2.imread(image1_path, cv2.IMREAD_UNCHANGED)
    image2 = cv2.imread(image2_path, cv2.IMREAD_UNCHANGED)

    if image1 is not None and image2 is not None:
        # Scale pixel values by 1/255 before prediction.
        image1 = image1 / 255.0
        image2 = image2 / 255.0

        print("Generating caption for the pair of images...")
        caption = predict1(image1, image2, model_tokenizer)
        print(f"Generated Caption: {caption}")

        image1_list = [image1_path, 'image3.png']
        image2_list = [image2_path, 'image4.png']
        print("\nGenerating captions for a list of image pairs...")
        captions = function1(image1_list, image2_list, model_tokenizer)
        for i, cap in enumerate(captions):
            print(f"Pair {i+1}: {cap}")

        true_captions = [
            "no acute cardiopulmonary findings",
            "probably scarring in the left upper lobes",
        ]
        print("\nComputing BLEU scores for a list of image pairs...")
        # No model is passed here, so function2 obtains one itself.
        bleu_scores = function2(true_captions, image1_list, image2_list)
        print(bleu_scores)
    else:
        print("Error: One or both images not found. Check paths.")