## **Import Needed Libraries**

In [None]:
import cv2
import torch
from PIL import Image
import matplotlib.pyplot as plt
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

## **Load Model**

In [None]:
# Single checkpoint id shared by the model, the image processor and the tokenizer
checkpoint = "nlpconnect/vit-gpt2-image-captioning"

# Image-captioning model (vision encoder + text decoder)
model = VisionEncoderDecoderModel.from_pretrained(checkpoint)
# Image processor that turns PIL images into pixel tensors for the model
feature_extractor = ViTImageProcessor.from_pretrained(checkpoint)
# Tokenizer used to decode generated token ids back into text
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
# Upper bound on the number of tokens in a generated caption
max_length = 16
# Beam width used by beam search during generation
num_beams = 4
# Keyword arguments unpacked into model.generate
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)

## **Prediction Function**

In [None]:
def predict_step(image_paths):
    """Generate one caption for each image in *image_paths*.

    Args:
        image_paths: Iterable of image locations accepted by ``Image.open``.

    Returns:
        A list of caption strings in input order, each stripped of
        surrounding whitespace. An empty input yields an empty list.
    """
    images = []
    for image_path in image_paths:
        # Open inside a context manager so the file handle is released
        # deterministically; convert() loads the pixel data and returns an
        # independent RGB image that stays valid after the file is closed.
        with Image.open(image_path) as img:
            images.append(img.convert(mode="RGB"))

    # Nothing to caption: skip preprocessing and generation entirely.
    if not images:
        return []

    # Preprocess the images into a batch of pixel tensors on the model's device
    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # Generate caption token ids using the shared generation settings
    output_ids = model.generate(pixel_values, **gen_kwargs)

    # Decode token ids to text, dropping special tokens and stray whitespace
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]

## **Test Model**

In [None]:
# Path of the sample image used to try out the captioning function
image = '/kaggle/input/test-image/image.jpg'
# cv2.imread reports failure by returning None instead of raising, so check
# explicitly rather than letting cvtColor fail with an opaque OpenCV error.
img = cv2.imread(image)
if img is None:
    raise FileNotFoundError(f"Could not read image: {image}")
# OpenCV loads pixels in BGR order; convert to RGB for matplotlib
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.axis('off')
plt.show()

In [None]:
# Caption the test image; predict_step returns a list with one caption per path
predict_step([image])