<a href="https://colab.research.google.com/github/ansar2019/image-captionig/blob/main/VITGPT2_Image_Captioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Installing Transformer Libraries
!pip install transformers



In [2]:
# Web links Handler
import requests

# Backend
import torch

# Image Processing
from PIL import Image

# Transformer and Pretrained Model
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, GPT2TokenizerFast

# Progress bars for long-running loops
from tqdm import tqdm

# Pick the compute device: use the GPU when CUDA is usable, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Loading a fine-tuned image captioning Transformer Model

# Single source of truth for the checkpoint, so the model, tokenizer and
# image processor are always loaded from the same place.
MODEL_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

# Vision encoder-decoder model, moved to the selected device
model = VisionEncoderDecoderModel.from_pretrained(MODEL_CHECKPOINT).to(device)

# GPT-2 tokenizer, used to decode the generated token ids into text
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_CHECKPOINT)

# Image processor, turns an image into the tensors the model consumes
image_processor = ViTImageProcessor.from_pretrained(MODEL_CHECKPOINT)

config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

In [4]:
# Accessing images from the web
import urllib.parse as parse
import os
# Verify url
def check_url(string):
    """Return True when ``string`` parses as a URL with a scheme, host and path.

    Args:
        string: Candidate URL (normally a ``str``).

    Returns:
        bool: True only if the parsed scheme, netloc and path are all
        non-empty; False otherwise, including when the value cannot be
        parsed at all.
    """
    try:
        result = parse.urlparse(string)
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError for malformed URLs (e.g. an unbalanced
        # IPv6 bracket) and TypeError/AttributeError for non-string input.
        # Only those are treated as "not a URL"; unrelated exceptions such
        # as KeyboardInterrupt are no longer swallowed.
        return False
    return all([result.scheme, result.netloc, result.path])

# Load an image
def load_image(image_path):
    """Open an image from a URL or from a local file.

    Args:
        image_path: Either a URL (anything ``check_url`` accepts) or a path
            to an existing file.

    Returns:
        The PIL image opened from the downloaded bytes or from the file.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
        FileNotFoundError: If ``image_path`` is neither a URL nor an
            existing file (previously this case silently returned None).
    """
    if check_url(image_path):
        from io import BytesIO  # local import: only the URL branch needs it

        # A timeout stops a stalled server from hanging the notebook, and
        # raise_for_status() prevents an HTML error page from being handed
        # to PIL as if it were image data.
        response = requests.get(image_path, timeout=30)
        response.raise_for_status()
        # response.content is fully read and content-decoded, so the
        # connection is released and PIL gets a seekable buffer.
        return Image.open(BytesIO(response.content))

    if os.path.exists(image_path):
        return Image.open(image_path)

    raise FileNotFoundError(
        f"{image_path!r} is neither a valid URL nor an existing file"
    )


In [5]:
# Image inference
def get_caption(model, image_processor, tokenizer, image_path):
    """Generate a caption for a single image.

    Args:
        model: Captioning model exposing ``generate`` and ``device``.
        image_processor: Callable turning a PIL image into model inputs.
        tokenizer: Tokenizer used to decode the generated token ids.
        image_path: URL or local path, forwarded to ``load_image``.

    Returns:
        str: The first decoded caption, with special tokens removed.

    Raises:
        FileNotFoundError: If no image could be loaded from ``image_path``.
    """
    image = load_image(image_path)
    if image is None:
        # Fail with a clear message instead of an obscure error from the
        # image processor further down.
        raise FileNotFoundError(f"Could not load an image from {image_path!r}")

    # Grayscale, palette or RGBA files are converted so the processor always
    # receives three colour channels.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Preprocessing the image; the tensors go to the device of the model that
    # was passed in, rather than to a notebook-level global.
    inputs = image_processor(image, return_tensors="pt").to(model.device)

    # Generating captions
    output = model.generate(**inputs)

    # Decode the generated ids; one image was passed in, so take the first
    caption = tokenizer.batch_decode(output, skip_special_tokens=True)[0]

    return caption

In [6]:
# Mount Google Drive at /content/Drive. The captioning loop further down
# reads its test images from /content/Drive/MyDrive/testdata.
from google.colab import drive
drive.mount('/content/Drive')


Mounted at /content/Drive


In [7]:
import glob

# Caption every JPEG in the Drive test folder, in sorted filename order.
filenames = sorted(glob.glob("/content/Drive/MyDrive/testdata/*.jpg"))

# The results file is opened once, still in append mode, instead of being
# reopened for every image. The previous extra Image.open() per file was
# dropped: its result was never used and the handle was never closed.
with open('VITGPT2.txt', 'a') as fp:
    for image in filenames:  # `image` is the file path, not a PIL image
        caption = get_caption(model, image_processor, tokenizer, image)
        print(image, caption)
        fp.write(str([image]) + str([caption]))
        fp.write('\n')
        # Flush per image so finished captions survive an interrupted run,
        # as they did when the file was closed after every write.
        fp.flush()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


/content/Drive/MyDrive/testdata/File 001.jpg a man in a suit and tie speaking to a crowd 




/content/Drive/MyDrive/testdata/File 002.jpg three children sitting at a table with a laptop 
/content/Drive/MyDrive/testdata/File 003.jpg three women sitting at a table with plates of food 
/content/Drive/MyDrive/testdata/File 004.jpg a baby sitting in a high chair eating carrots 
/content/Drive/MyDrive/testdata/File 005.jpg a man and a woman sitting on a couch 
/content/Drive/MyDrive/testdata/File 006.jpg people walking down a street holding umbrellas 
/content/Drive/MyDrive/testdata/File 007.jpg people sitting around a table 
/content/Drive/MyDrive/testdata/File 008.jpg a little girl standing in a kitchen with a spoon 
/content/Drive/MyDrive/testdata/File 009.jpg a small plane with people standing around it 
/content/Drive/MyDrive/testdata/File 010.jpg a woman is talking on a cell phone 
/content/Drive/MyDrive/testdata/File 011.jpg two women standing next to each other holding umbrellas 
/content/Drive/MyDrive/testdata/File 012.jpg a woman sitting at a desk with a laptop 
/content/D