<a href="https://colab.research.google.com/github/abhirammohanmadhav/Image-Caption-Verifier/blob/main/ImageCaptionVerifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install transformers
!pip install diffusers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3
Looking in indexes: https://pypi.org/simple, https://u

In [6]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image
from diffusers import StableDiffusionPipeline
import os
from skimage.metrics import structural_similarity, mean_squared_error
import cv2

def predict_caption(image_paths, model_type):
    """
    This function is used to generate the caption from images. Here a gpt2-image-captioning
    model is used, taken from huggingface.
    Args:
            image_paths: The path to the input image.
            model_type: The type of model to be used for caption generation
    """
    model = VisionEncoderDecoderModel.from_pretrained(model_type)
    feature_extractor = ViTImageProcessor.from_pretrained(model_type)
    tokenizer = AutoTokenizer.from_pretrained(model_type)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    max_length = 16
    num_beams = 4
    gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

    images = []
    """
    Here PIL.Image library is used instead ofOpenCV. If OpenCV is used, need to convert 
    from BGR to RGB.
    """
    i_image = Image.open(image_paths) 
    if i_image.mode != "RGB":
        i_image = i_image.convert(mode="RGB")

    images.append(i_image)
    print("input image size: ", images[0].size)

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return images, preds

def generate_image(images, preds, model, output_path, out_img_name):  
    """
    This function is used to generated the image from the caption already generated by 
    predict_caption() function. Here a stable diffusion model v1.5 is used.
    Args:
            images: The list of images from predict_caption() function
            preds: The captions generated for the images.
            model: The type of model to be used for text-to-image generation
            output_path: Root path to store the generated images
            out_img_name: Name of the image to be stored in
    """
    pipe = StableDiffusionPipeline.from_pretrained(model, torch_dtype=torch.float16)
    pipe = pipe.to("cuda")
    image = pipe(preds).images[0]  # captions used from the previous function

    final_path = os.path.join(output_path, out_img_name)
    new_size = images[0].size
    image.resize(new_size)
    print("output image size: ", image.size)
    image.save(final_path)

def similarity_value(ref_image, gen_image, width, height):
    """
    This function is to compare how similar the reference and the generated images are.
    Args:
            ref_image: The path to the reference image.
            gen_image: The path to the generated image.
            width, height: The width and height of the image to be resized to. Both the images 
                                    should be of same dimensions.
    """
    ref_img = cv2.imread(ref_image)
    gen_img = cv2.imread(gen_image)    

    resized_inp_img = cv2.resize(ref_img, (width, height))
    resized_gen_img = cv2.resize(gen_img, (width, height))

    # For SSIM, the images shouls be in gray scale
    ref_img_gray = cv2.cvtColor(resized_inp_img, cv2.COLOR_BGR2GRAY)
    gen_img_gray = cv2.cvtColor(resized_gen_img, cv2.COLOR_BGR2GRAY)

    #SSIM score
    ssim_score, ssim_image = structural_similarity(ref_img_gray, gen_img_gray, full=True)

    #MSE 
    mse = mean_squared_error(resized_inp_img, resized_gen_img)
  
    print('SSIM score:', ssim_score)
    print('MSE score:', mse)
    return ssim_score, mse

if __name__ == "__main__":
    """
    Can generated a config.py python script for larger number of inputs and more complex 
    functions.
    """

    model_type = 'nlpconnect/vit-gpt2-image-captioning' # caption generator
    model_id = 'runwayml/stable-diffusion-v1-5'# image generator
    root_path = f'/content/input_images' # root path for the input image
    image_name = f'image_2.jpg'
    output_path = f'/content'# root path for the output image
    width = 512
    height = 512

    # Here same image name is used for both reference and output images
    final_path = os.path.join(output_path, image_name)
    complete_path = os.path.join(root_path, image_name)

    image, caption = predict_caption(complete_path, model_type)
    generate_image(image, caption, model_id, output_path, image_name)
    ssim, mse = similarity_value(complete_path, final_path)


Downloading (…)lve/main/config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

input image size:  (612, 402)


Downloading (…)ain/model_index.json:   0%|          | 0.00/543 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

Downloading pytorch_model.bin:   0%|          | 0.00/492M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading (…)_encoder/config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

Downloading (…)cheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

Downloading (…)tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)_checker/config.json:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

Downloading (…)on_pytorch_model.bin:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading (…)819/unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Downloading (…)d819/vae/config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading (…)tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading (…)on_pytorch_model.bin:   0%|          | 0.00/335M [00:00<?, ?B/s]

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


  0%|          | 0/50 [00:00<?, ?it/s]

output image size:  (512, 512)
SSIM score: 0.19060797904890878
MSE score: 9005.719446818033
