In [1]:
# Installing and importing
!pip install sentence_transformers

from google.colab import drive
from sentence_transformers import SentenceTransformer, util
from PIL import Image
import glob
import torch
import pickle
import zipfile
import pandas as pd
import re
import random
from IPython.display import display
from IPython.display import Image as IPImage
import os
from tqdm.autonotebook import tqdm
torch.set_num_threads(4)
import urllib.request
from skimage import io
from google.colab import output
from google.colab import files

# Get Visual-Genome data set in /tmp folder
!wget -P /tmp http://visualgenome.org/static/data/dataset/image_data.json.zip 
!unzip /tmp/image_data.json.zip -d /tmp

###############################################################################################################
# Adjust the path below so that `result` holds result.csv (output of the noun-phrase extraction step)
# as a DataFrame. Google Drive is mounted first because the file is read from there.
drive.mount('/content/drive')
result_csv_path = '/content/drive/MyDrive/Vision and Language grounding/result.csv'
result = pd.read_csv(result_csv_path)
###############################################################################################################

# Wipe the install / download / mount logs from the cell output
output.clear()

In [2]:
# Load the pretrained CLIP checkpoint; the same model encodes the images and the queries
clip_checkpoint = 'clip-ViT-B-32'
model = SentenceTransformer(clip_checkpoint)

# Pull the two fields we need out of one Visual-Genome image record
def extract_info(image):
    """Return the ``(url, image_id)`` pair stored in an image record.

    ``image`` is a mapping that must contain the keys ``'url'`` and
    ``'image_id'``; a missing key raises ``KeyError``.
    """
    url = image['url']
    identifier = image['image_id']
    return (url, identifier)

# Extract image info from the Visual-Genome metadata json
import json
# Context manager so the file handle is closed as soon as the json is parsed
with open('/tmp/image_data.json') as f:
    data = json.load(f)
image_info = list(map(extract_info, data))  # (url, image_id) for every image in the metadata

# Finding image embeddings
# Using 25000 random images for now
num_images = 25000

# Draw the random subset ONCE (without replacement) and keep the url/id of exactly
# those images, in the order they are encoded. search() looks images up with
# image_id[corpus_id] / image_link[corpus_id], where corpus_id is a row of img_emb,
# so both lists must line up with img_emb row for row. Sampling the links inline
# while leaving the full lists in place would make every hit point at a wrong image.
# min() keeps random.sample from raising when fewer than num_images images exist.
sampled_info = random.sample(image_info, min(num_images, len(image_info)))
image_link = [url for url, _ in sampled_info]        # links of the encoded images, in img_emb row order
image_id = [img_id for _, img_id in sampled_info]    # matching image ids, same order

# Each url is downloaded and wrapped in a PIL image before being handed to the model
img_emb = model.encode(
    [Image.fromarray(io.imread(URL)) for URL in image_link],
    batch_size=128,
    convert_to_tensor=True,
    show_progress_bar=True,
)

output.clear()

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/605M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/604 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/961k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


KeyboardInterrupt: ignored

In [None]:
# Write the image embeddings to tensor.pt in the current working directory
torch.save(img_emb, 'tensor.pt')

# Kept as the LAST expression of the cell: only the last expression is displayed,
# so placed before torch.save its value was silently discarded.
type(img_emb)

In [None]:
# Search function: look up the top k images related to a query
def search(query, k=5):
    """Return the ids and links of the ``k`` images that best match ``query``.

    The query is encoded with the module-level ``model`` and compared against
    ``img_emb`` through ``util.semantic_search``. Each hit's ``corpus_id`` is
    used as a position into the module-level ``image_id`` and ``image_link``
    lists, so those lists have to be in the same order as the rows of
    ``img_emb``.

    Args:
        query: the item to encode (passed to ``model.encode`` as a
            one-element list).
        k: maximum number of hits requested from ``semantic_search``.

    Returns:
        A tuple ``(image_id_list, image_link_list)`` of two parallel lists,
        one entry per hit, in the order the hits were returned.
    """
    encoded_query = model.encode([query], convert_to_tensor=True, show_progress_bar=False)

    # semantic_search returns one hit list per query; there is a single query here
    top_hits = util.semantic_search(encoded_query, img_emb, top_k=k)[0]

    # Positions of the matched images inside the embedded corpus
    corpus_positions = [hit['corpus_id'] for hit in top_hits]

    image_id_list = [image_id[position] for position in corpus_positions]
    image_link_list = [image_link[position] for position in corpus_positions]
    return image_id_list, image_link_list

In [None]:
# Raw 'Noun_phrase' column of result as an array, one entry per row of the csv.
# Each entry is handed to process_nouns below, which slices off its first and last
# character -- presumably a list serialized as text; TODO confirm against result.csv.
noun_phrase = result['Noun_phrase'].values

# Function to turn one serialized noun-phrase list from result into a list of strings
def process_nouns(noun_phrases):
    """Parse a stringified list such as ``"['a cat', 'the dog']"``.

    Args:
        noun_phrases: text of a bracketed, comma-separated list of quoted
            phrases. Anything that is not a string (e.g. the NaN pandas uses
            for an empty csv cell) is treated as "no phrases".

    Returns:
        list[str]: the phrases without brackets or quotes; ``[]`` for an
        empty list or a non-string input.
    """
    import ast  # local import keeps this function self-contained

    if not isinstance(noun_phrases, str):
        return []

    # Preferred path: parse the text as a Python literal. Unlike splitting on
    # ', ', this keeps phrases that themselves contain ", " in one piece.
    try:
        parsed = ast.literal_eval(noun_phrases)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(phrase) for phrase in parsed]

    # Fallback for text that is not a valid literal (e.g. an unescaped quote
    # inside a phrase): strip the brackets, split on ', ', strip the quotes.
    inner = noun_phrases[1:-1]
    if not inner:
        # "[]" means no phrases, not one empty phrase
        return []
    return [piece[1:-1] for piece in re.split(', ', inner)]

# Parsed noun phrases: one list of strings per sentence, in the order of noun_phrase
l = [process_nouns(entry) for entry in noun_phrase]
output.clear()


In [None]:
# Top k image matches for every noun phrase of every sentence.
# Layout: similarity_list[sentence][noun] == {'noun': <phrase>, 'img': {image_id: image_link, ...}}
similarity_list = []

# One pass per sentence; results are appended as they are computed
for phrases in l:

    # Matches collected for the noun phrases of this sentence
    matches_for_sentence = []

    for phrase in phrases:
        # ids and links of the best matching images for this noun phrase
        matched_ids, matched_links = search(phrase)

        # Pair every image id with its link and store it next to the phrase itself
        matches_for_sentence.append({
            'noun': phrase,
            'img': dict(zip(matched_ids, matched_links)),
        })

    similarity_list.append(matches_for_sentence)

In [None]:
# Look at everything stored in similarity_list for one sentence.
# s_idx is the position of that sentence.
s_idx = 0

sentence_matches = similarity_list[s_idx]
print('Dict of noun_phrases in the required sentence is:')
display(sentence_matches)

In [None]:
# Position of the noun phrase (within sentence s_idx) whose matches are shown
n_idx = 2

noun_entry = similarity_list[s_idx][n_idx]
print('Top k matches for noun "{}" are:'.format(noun_entry['noun']))
print('image_id  image_link')
display(noun_entry['img'])

In [None]:
# Processing to convert similarity_list to one flat data frame with the
# columns noun, img (image link) and img_id.

# One small frame per noun phrase: index = image ids, columns = noun / img.
# Iterating similarity_list directly (instead of range(len(result.index))) cannot
# run past its end and leaves the s_idx / n_idx picked in earlier cells untouched.
frames = [
    pd.DataFrame.from_dict(noun_entry)
    for sentence_entries in similarity_list
    for noun_entry in sentence_entries
]

# Concatenate once: DataFrame.append inside a loop copied the whole frame on every
# iteration and no longer exists in pandas >= 2.0.
if frames:
    data = pd.concat(frames)
else:
    # pd.concat refuses an empty list; keep the expected columns for an empty result
    data = pd.DataFrame({'noun': [], 'img': []})

# The image ids live in the index; turn them into a regular column called img_id
data = data.reset_index().rename(columns={'index': 'img_id'})
new_cols = ["noun","img","img_id"]
data = data[new_cols]

In [None]:
# Write the data frame to a csv file, then hand that file to the browser for download
csv_name = 'images.csv'
data.to_csv(csv_name)
files.download(csv_name)

In [None]:
# Block this cell until it is interrupted by hand -- presumably to keep the
# runtime from going idle; confirm it is still needed.
# Sleeping between iterations avoids the busy-wait of `while True: pass`,
# which keeps one CPU core at 100% for as long as the cell runs.
import time
while True:
    time.sleep(60)

In [3]:
# Show the current working directory of the notebook process
os.getcwd()

'/content'