CS5100 Foundations of Artificial Intelligence
Project
Authors: Arun Madhusudhanan, Tejaswini Dilip Deore

This notebook runs inference on the test dataset with the fine-tuned ViT-GPT2 model and computes evaluation metrics for the generated captions.

In [None]:
# Install required libraries.
# Each `!pip` line runs in a shell. Only transformers is pinned (4.17);
# `accelerate -U` upgrades to whatever release is newest at install time.
# NOTE(review): datasets, evaluate, rouge_score and accelerate are unpinned, so
# a fresh environment may resolve different versions — consider pinning them.
!pip install datasets
!pip install transformers==4.17
!pip install evaluate
!pip install rouge_score
!pip install accelerate -U

In [None]:
# Mount Google Drive at /content/drive so that later cells can copy the dataset
# archives and the fine-tuned model out of it (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Create a symbolic link /mydrive that points at the mounted "My Drive" folder.
# NOTE(review): no other cell visible here references /mydrive (the copy
# commands use /content/drive/MyDrive directly) — confirm this link is needed.
!ln -s /content/drive/My\ Drive/ /mydrive

In [None]:
# Copy the zipped image dataset (Images.zip) from Google Drive into /content/
!cp -r /content/drive/MyDrive/Image_captioning/Images.zip /content/


In [None]:
# Copy the zipped captions file (captions.txt.zip) from Google Drive into /content/
!cp -r /content/drive/MyDrive/Image_captioning/captions.txt.zip /content/

In [None]:
# Unzip the image archive into the /content/Images directory
!unzip /content/Images.zip -d /content/Images

In [None]:
# Unzip the captions archive into /content/.
# Later cells read "captions.txt" through a relative path, which assumes the
# notebook's working directory is /content — TODO confirm.
!unzip /content/captions.txt.zip -d /content/

In [None]:
# Copy the fine-tuned ViT-GPT2 model directory from Google Drive to
# /content/image-captioning-output/; later cells load the model, tokenizer and
# feature extractor from "./image-captioning-output".
!cp -r /content/drive/MyDrive/image-captioning-output/ /content/image-captioning-output/

In [None]:
# Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import matplotlib.image as mpimg
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor,AutoTokenizer

In [None]:
# Locations of the unzipped image folder and of the captions CSV file
image_data_location = "Images"
caption_data_location = "captions.txt"

# Load the captions, keep a single row per image (duplicates on the 'image'
# column are dropped) and renumber the remaining rows from 0.
df = (
    pd.read_csv(caption_data_location)
    .drop_duplicates(subset="image")
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting
2,1002674143_1b742ab4b8.jpg,A little girl covered in paint sits in front o...
3,1003163366_44323f5815.jpg,A man lays on a bench while his dog sits by him .
4,1007129816_e794419615.jpg,A man in an orange hat starring at something .


In [None]:
# Define utility functions for creating the dataset

from PIL import Image

def tokenization_fn(captions, max_target_length):
    """Tokenize caption text into fixed-length label ids.

    Args:
        captions: Caption text handed to the module-level ``tokenizer``.
        max_target_length: Length every id sequence is padded to and, for
            longer captions, truncated to.

    Returns:
        The ``input_ids`` attribute of the tokenizer output.
    """
    # truncation=True makes max_length an upper bound as well as a padding
    # target; without it, captions longer than max_target_length keep their
    # full length and the label sequences end up ragged.
    encoded = tokenizer(
        captions,
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
    )
    return encoded.input_ids

def feature_extraction_fn(images):
    """Run the module-level ``feature_extractor`` on ``images`` (requesting
    NumPy output) and return the resulting ``pixel_values``."""
    encoded = feature_extractor(images, return_tensors="np")
    return encoded.pixel_values

def preprocess_fn(images, captions, max_target_length, image_dir=image_data_location):
    """Build one model-ready example from an image file name and its caption.

    Args:
        images: File name of the image, joined onto ``image_dir`` with "/".
        captions: Caption text for that image.
        max_target_length: Forwarded to ``tokenization_fn``.
        image_dir: Directory holding the images; the default is the value
            ``image_data_location`` had when this function was defined.

    Returns:
        A dict with "labels" (from ``tokenization_fn``) and "pixel_values"
        (from ``feature_extraction_fn`` applied to the RGB-converted image).
    """
    rgb_image = Image.open("/".join((image_dir, images))).convert("RGB")
    label_ids = tokenization_fn(captions, max_target_length)
    pixels = feature_extraction_fn(rgb_image)
    return {"labels": label_ids, "pixel_values": pixels}

In [None]:
# Create the train and test datasets (75/25 split) for evaluation
from datasets import Dataset
import torch
import numpy
from tqdm import trange


# The first 75% of the rows form the training split, the rest the test split.
# The slices keep df's row labels, so test_df is indexed train_size..len(df)-1.
train_size = int(len(df) * 0.75)
train_df, test_df = df[:train_size], df[train_size:]

def train_data_generator():
    """Yield model inputs for the rows labelled 0-3 of ``train_df``.

    Each item is a dict holding the tokenized caption under "labels"
    (target length 40) and the squeezed pixel array under "pixel_values".
    """
    for row_label in range(4):
        file_name = train_df["image"][row_label]
        caption = train_df["caption"][row_label]
        example = preprocess_fn(file_name, caption, 40)
        yield {
            "labels": example["labels"],
            "pixel_values": example["pixel_values"].squeeze(),
        }

def test_data_generator():
    """Yield model inputs for every row of the test split.

    Rows are addressed by their label, from ``train_size`` up to
    ``len(df) - 1``, looked up in ``test_df``. Each item is a dict holding the
    tokenized caption under "labels" (target length 40) and the squeezed pixel
    array under "pixel_values".
    """
    for row_label in range(train_size, len(df)):
        file_name = test_df["image"][row_label]
        caption = test_df["caption"][row_label]
        example = preprocess_fn(file_name, caption, 40)
        yield {
            "labels": example["labels"],
            "pixel_values": example["pixel_values"].squeeze(),
        }

# The generators call preprocess_fn, which reads the module-level `tokenizer`
# and `feature_extractor`. Dataset.from_generator runs the generators right
# here, so both objects must already exist at this point. Load them from the
# fine-tuned checkpoint directory so this cell works on a fresh kernel
# (Restart & Run All) instead of failing with a NameError.
tokenizer = AutoTokenizer.from_pretrained("./image-captioning-output")
feature_extractor = AutoFeatureExtractor.from_pretrained("./image-captioning-output")

# Materialize both generators into datasets and show a summary of the test set.
train_dataset = Dataset.from_generator(train_data_generator)
test_dataset = Dataset.from_generator(test_data_generator)

print(test_dataset)

In [None]:
# inferencing on an image from test dataset

import requests
from PIL import Image


# Load the fine-tuned image captioning model together with its tokenizer and
# image processor from the local copy of the checkpoint (copied from Google Drive).
checkpoint_dir = "./image-captioning-output"
model_2 = VisionEncoderDecoderModel.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint_dir)

# Pick the image stored in row 6800 of df, load it as RGB and display it.
image_path = "/".join((image_data_location, df["image"][6800]))
img = Image.open(image_path).convert("RGB")
plt.imshow(mpimg.imread(image_path))

# Encode the image as PyTorch tensors for the model.
encoded_image = feature_extractor(img, return_tensors="pt")
pixel_values = encoded_image.pixel_values

# Autoregressively generate the caption ids with the model's default generation
# settings, then decode the first returned sequence back into text.
generated_ids = model_2.generate(pixel_values)
decoded_captions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = decoded_captions[0]
print(generated_text)

In [None]:
# Run this cell to compute the evaluation metric scores

import evaluate
from tqdm import trange

from cider import Cider
# Create a Cider object
cider = Cider()

bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")


def _bleu_up_to(precisions, brevity_penalty, order):
    """Return BLEU-`order`: the brevity penalty times the geometric mean of the
    first `order` n-gram precisions (0.0 if any of those precisions is zero)."""
    used = precisions[:order]
    if min(used) <= 0:
        return 0.0
    product = 1.0
    for precision in used:
        product *= precision
    return brevity_penalty * product ** (1.0 / order)


# Go through the test split one image at a time: generate a caption, score it
# against the single reference caption kept in df for that image, and collect
# the per-image scores. The averages are printed at the end.

bleu_1_scores = []
bleu_2_scores = []
rouge_1_scores = []
rouge_L_scores = []
meteor_scores = []
cider_scores = []

for i in trange(train_size, len(df)):
    # test_df keeps df's row labels, so label i addresses the i-th row of df.
    image_name = test_df["image"][i]
    reference_caption = test_df["caption"][i]

    img = Image.open(image_data_location + "/" + image_name).convert("RGB")
    pixel_values = feature_extractor(img, return_tensors="pt").pixel_values
    generated_ids = model_2.generate(pixel_values)
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    predictions = [generated_text]
    references = [[reference_caption]]

    # The Cider inputs carry a dummy entry first and the real image second.
    # NOTE(review): cider.py is a local module not visible here. The argument
    # order (predictions, references) and the use of scores[1] to pick the real
    # image's score (index 0 presumably being the dummy 'image1' entry) are
    # kept exactly as they were — confirm both against cider.py.
    cider_predictions = {
        'image1': ['This is a dummy data for cider.py to work'],
        image_name: [generated_text],
    }
    cider_references = {
        'image1': ['This is a dummy data for cider.py to work'],
        image_name: [reference_caption],
    }

    bleu = bleu_metric.compute(predictions=predictions, references=references)
    rouge = rouge_metric.compute(predictions=predictions, references=references)
    meteor = meteor_metric.compute(predictions=predictions, references=references)
    _, scores = cider.compute_score(cider_predictions, cider_references)

    # bleu["precisions"][k] is only the raw (k+1)-gram precision. BLEU-N also
    # applies the brevity penalty and, for N > 1, the geometric mean over the
    # 1..N-gram precisions, so derive the real BLEU-1 / BLEU-2 values here.
    bleu_1_scores.append(_bleu_up_to(bleu["precisions"], bleu["brevity_penalty"], 1))
    bleu_2_scores.append(_bleu_up_to(bleu["precisions"], bleu["brevity_penalty"], 2))

    rouge_1_scores.append(rouge["rouge1"])
    rouge_L_scores.append(rouge["rougeL"])

    meteor_scores.append(meteor['meteor'])

    cider_scores.append(scores[1])

# Report the mean of the per-image scores for every metric.
for label, values in (
    ("BLEU-1: ", bleu_1_scores),
    ("BLEU-2: ", bleu_2_scores),
    ("ROUGE-1: ", rouge_1_scores),
    ("ROUGE-L: ", rouge_L_scores),
    ("METEOR: ", meteor_scores),
    ("CIDER: ", cider_scores),
):
    print(label, sum(values) / len(values))