# Image Caption Generation Using the ViT-GPT2 Image Captioning Model


#### Installing packages

In [None]:
# installing transformers
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m78.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
#installing pyngrok
!pip install pyngrok==4.1.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyngrok==4.1.1
  Downloading pyngrok-4.1.1.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyngrok
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone
  Created wheel for pyngrok: filename=pyngrok-4.1.1-py3-none-any.whl size=15979 sha256=41205ad7245b32bbd3ff8dab97e7793553ab8a4706219ee350a7e6c3ca201aed
  Stored in directory: /root/.cache/pip/wheels/89/2d/c2/abe6bcfde6bce368c00ecd73310c11edb672c3eda09a090cfa
Successfully built pyngrok
Installing collected packages: pyngrok
Successfully installed pyngrok-4.1.1


In [None]:
#installing streamlit
!pip install streamlit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting streamlit
  Downloading streamlit-1.21.0-py2.py3-none-any.whl (9.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m102.0 MB/s[0m eta [36m0:00:00[0m
Collecting validators>=0.2
  Downloading validators-0.20.0.tar.gz (30 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting watchdog
  Downloading watchdog-3.0.0-py3-none-manylinux2014_x86_64.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting blinker>=1.0.0
  Downloading blinker-1.6.2-py3-none-any.whl (13 kB)
Collecting pympler>=0.9
  Downloading Pympler-1.0.1-py3-none-any.whl (164 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.8/164.8 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19
  Downloading GitPython-3.1.31-py3-none-any.whl

### Creating the Streamlit app for image captioning

In [None]:
%%writefile app.py

#importing necessary libraries
import streamlit as st
import tensorflow as tf


st.set_option('deprecation.showfileUploaderEncoding', False)

st.write("""# Instant Image Caption Generator""")

#creating file uploader
file = st.file_uploader("Please upload the image to generate the caption", type = ["jpg", "png"])
 
#creating if-else function to perform the task for captioning. If file is not uploaded the text "Please upload an image file." 
#will be displayed and when the image gets uploaded the captioning task will be performed
if file is None:
    st.text("Please upload an image file.")
else:
    #opens and dispays the uploaded image
    from PIL import Image
    image = Image.open(file)
    st.image(image, use_column_width=True)
    
    #importing necessary libraries to perform image captioning
    from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
    import torch
    import cv2
    from PIL import Image

    #downloading the pretrained model from hugging face.
    ptmodel = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") #downloading model
    feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")  #downloading feature extractor
    tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")  #downloading tokenizer

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    ptmodel.to(device)

    #defining the constansts for kwargs
    max_length = 16
    num_beams = 4
    gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
    
    #takes images
    images = []
    i_image = Image.open(file)
    #if the image is not in RGB format this code converts the image to RGB  
    if i_image.mode != "RGB":
      i_image = i_image.convert(mode="RGB")
    images.append(i_image)

    #generating pixel values (feature extraction)
    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    #generating output ids
    output_ids = ptmodel.generate(pixel_values, **gen_kwargs)

    #prediction
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]

    #dispaying the caption
    string = "The caption for the image is : "+preds.__str__()
    st.success(string)


Writing app.py


In [None]:
# Register the ngrok authtoken without storing the secret in the notebook.
# The token is read from the NGROK_AUTHTOKEN environment variable when set,
# otherwise it is requested interactively (input is not echoed).
# NOTE(review): a real token was previously hardcoded in this cell; it should
# be revoked and replaced in the ngrok dashboard.
import os
from getpass import getpass

from pyngrok import ngrok

ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [None]:
#running the app
!nohup streamlit run app.py &

nohup: appending output to 'nohup.out'


In [None]:
# Open an ngrok tunnel to the local Streamlit server and display the public URL.
from pyngrok import ngrok

STREAMLIT_PORT = 8501  # port the tunnel forwards to
url = ngrok.connect(port=STREAMLIT_PORT)
url

'http://25d4-34-87-98-56.ngrok-free.app'

In [None]:
# show the error if there are any
!cat /content/nohup.out