In [1]:
import torch
import cv2
import os
import re
from matplotlib import pyplot as plt
from PIL import Image
from transformers import pipeline
from torch.utils.data import Dataset, DataLoader
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer, DetrFeatureExtractor, DetrForObjectDetection
import timm  # needed for resnet

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
encoder_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
encoder_feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

gpt2_generator = pipeline('text-generation', model='gpt2')
# not doing object detection for now due to issues with paintings and cartoon-like images
# obj_feature_extractor = DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50")
# obj_dect_model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

encoder_model.to(device)
#obj_dect_model.to(device)
print()

Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 665/665 [00:00<00:00, 420kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 548M/548M [00:32<00:00, 16.7MB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.04M/1.04M [00:00<00:00, 1.68MB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 740kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.36M/1.36M [00




In [21]:
class ImageDataset(Dataset):

    def __init__(self, image_dir):
        """Initialize the attributes of the object of the class."""
        self.image_dir = image_dir
        image_path_jpeg = sorted(self.find_files(image_dir, pattern=".jpeg"))
        image_path_png = sorted(self.find_files(image_dir, pattern=".png"))
        self.image_path_list = image_path_jpeg + image_path_png

    def __len__(self):
        """Return the size of the dataset."""
        return len(self.image_path_list)

    def __getitem__(self, index):
        """Return a data sample (=image) for a given index, along with the name of the corresponding pokemon."""
        
        return self.image_path_list[index]        
    
    def find_files(self, directory, pattern=".jpg"):
        """
        Recursive search method to find files. Credit to Paul Magron and Andrea de Marco
        for OG method
        """

        return  [f.path for f in os.scandir(directory) if f.path.endswith(pattern)]

In [22]:
def predictive_step(im_path, im_reader="PIL", max_len=16, num_beams=4):
    gen_kwargs = {"max_length": max_len, "num_beams": num_beams}
    all_ims = []
    for im in im_path:
        #i_image = Image.open(im)        
        i_image = cv2.imread(im)        
        #if i_image.mode != "RGB":            
            #i_image = i_image.convert(mode="RGB")
        all_ims.append(i_image)
    
    pixel_values = encoder_feature_extractor(images=all_ims, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    output_ids = encoder_model.generate(pixel_values, **gen_kwargs)
    
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds

In [23]:
og_images_path = ImageDataset("./images_to_test_sys")

#predictive_step()

In [24]:
sample = og_images_path[2:3]
print(sample)
predictive_step(sample)  # passing as a list to be able to iterate

['./images_to_test_sys/original_demo_image_0.png']


['a river flowing through a forest filled with trees']

In [25]:
img = Image.open(sample[0])  # we use PIL for demo purposes
img.show()

In [40]:
print(img.size)
print(img.size[::-1])
#[image.size[::-1]]

(526, 800)
(800, 526)


In [26]:
nameRegex = re.compile(r"\./images_to_test_sys(/.*)\.\w+$")
mo = nameRegex.search(sample[0])
image_name = mo.group(1)

In [27]:
image = cv2.imread(sample[0])

# Old version.

Try running the cell from next cell

In [171]:
# OPENCV's method based on Omar's and Alexio's Algorithm
cuts_path = "./output_cropped_images/"
hgt, wid, channels = image.shape
#img.show()
print(wid, hgt)
nb_row = 3
nbr = 1
Wim = wid//nb_row
Him = hgt//nb_row

total_tiles = {}
group_tiles = []
all_titles = []
for k in range(1, nb_row+1):
    for i in range(0, nb_row-k+1):
        for j in range(0, nb_row-k+1):
            
            y = Him*j
            corr_height = Him * (j+k)
            #y1 = corr_height + nb_row
            x = Wim*i
            corr_width = Wim * (i+k)
            #x1 = corr_width + nbr                         
            height_cut = corr_height - y
            width_cut = corr_width - x
            cut_dims = (height_cut, width_cut)
            if cut_dims not in total_tiles.keys():
                group_tiles = []
                
            # coordinates are wrong, double check them with team
            top_left_pixel = (x, corr_height)
            bottom_right_pixel = (y, corr_width)
            coordinates = [top_left_pixel, bottom_right_pixel]
            
            tiles = image[y:corr_height, x:corr_width]

            #group_tiles.append(tiles)  # we need the dims to draw the rectangles, not the tiles
            group_tiles.append(coordinates)
            total_tiles.setdefault(cut_dims, group_tiles)
            
            title = cuts_path + image_name + '_opencv_cut'+str(nbr)+'.jpeg'       
                        
            # cv2.imwrite('saved_patches/'+'tile'+str(x)+'_'+str(y)+'.jpg', tiles)
            cv2.imwrite(title, tiles)
            #cv2.rectangle(image, (y, corr_height), (x, corr_width), (0, 255, 0), 1)            
            all_titles.append(title)
            
            nbr += 1           
#plt.imshow(image[:, :, ::-1])    

526 800


# Refactored Omar's and Alexio's cutting version of the algorithm

In [28]:
def imcrop(img, bbox): 
    x1,y1,x2,y2 = bbox
    if x1 < 0 or y1 < 0 or x2 > img.shape[1] or y2 > img.shape[0]:
        img, x1, x2, y1, y2 = pad_img_to_fit_bbox(img, x1, x2, y1, y2)    
    return img[y1:y2, x1:x2, :]

def pad_img_to_fit_bbox(img, x1, x2, y1, y2):
    img = np.pad(img, ((np.abs(np.minimum(0, y1)), np.maximum(y2 - img.shape[0], 0)),
               (np.abs(np.minimum(0, x1)), np.maximum(x2 - img.shape[1], 0)), (0,0)), mode="constant")
    y2 += np.abs(np.minimum(0, y1))
    y1 += np.abs(np.minimum(0, y1))
    x2 += np.abs(np.minimum(0, x1))
    x1 += np.abs(np.minimum(0, x1))
    
    return img, x1, x2, y1, y2

In [29]:
cuts_path = "./output_cropped_images/"
hgt, wid, channels = image.shape
#img.show()
print(wid, hgt)
nb_row = 3
nbr = 1
Wim = wid//nb_row
Him = hgt//nb_row

all_titles = []

for k in range(1, nb_row+1):
    for i in range(0, nb_row-k+1):
        for j in range(0, nb_row-k+1):

            bbox = (Wim*i, Him*j, Wim*(i+k), Him*(j+k))
            tiles = imcrop(image, bbox)

            title = cuts_path + image_name + '_opencv_cut'+str(nbr)+'.jpeg'       
            #title = cuts_path + image_name + '_opencv_cut'+str(nbr)+'.jpg'       
                        
            
            cv2.imwrite(title, tiles)
            #cv2.rectangle(image, (y, corr_height), (x, corr_width), (0, 255, 0), 1)            
            all_titles.append(title)
            nbr += 1    

526 800


In [30]:
# for k in total_tiles:
#     print('-'*8)
#     image_copy = image.copy() 
#     for v in total_tiles[k]: 
#         cv2.rectangle(image_copy, v[0], v[1], (0, 255, 0), 1)
#         print(v)
#     cv2.imshow("Patched Image",image_copy)
#     cv2.waitKey(5000)
#     cv2.destroyAllWindows()
    
    #plt.imshow(image_copy[:, :, ::-1])

In [31]:
liste_captions = predictive_step(all_titles)

In [32]:
liste_captions

['a forest filled with lots of trees and shrubs',
 'a blurry photo of some rocks and trees',
 'a blurry picture of a rocky area with trees and rocks',
 'a painting of a forest with trees and a river',
 'a waterfall that is in the middle of a forest',
 'a river with a bunch of animals in it',
 'a bird perched on top of a tree branch',
 'a forest filled with lots of trees and bushes',
 'a river with a bunch of trees on top of it',
 'a river filled with lots of trees and shrubbery',
 'a river filled with lots of water and trees',
 'a stream of water flowing through a forested area',
 'a river filled with lots of water and trees',
 'a river flowing through a forest filled with trees']

In [12]:
#https://towardsdatascience.com/an-intuitive-explanation-of-sentence-bert-1984d144a868
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [13]:
sentences = ['a forest filled with lots of trees and shrubs', 'a blurry photo of some rocks and trees',
             'a blurry picture of a rocky area with trees and rocks', 'a painting of a forest with trees and a river',
             'a waterfall that is in the middle of a forest', 'a river with a bunch of animals in it',
             'a bird perched on top of a tree branch', 'a forest filled with lots of trees and bushes',
             'a river with a bunch of trees on top of it', 'a river filled with lots of trees and shrubbery',
             'a river filled with lots of water and trees', 'a stream of water flowing through a forested area',
             'a river filled with lots of water and trees', 'a river flowing through a forest filled with trees']

In [19]:
#encode the sentences 
embeddings = model.encode(sentences, convert_to_tensor=True)#compute the similarity scores
cosine_scores = util.cos_sim(embeddings, embeddings)#compute/find the highest similarity scores
pairs = []
for i in range(len(cosine_scores)-1):
    for j in range(i+1, len(cosine_scores)):
        pairs.append({'index': [i, j], 'score': cosine_scores[i] 
                                                             [j]})#sort the scores in decreasing order 
pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)
print(len(pairs))
#for pair in pairs[0:10]:
for pair in pairs:
    i, j = pair['index']
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i],
                                  sentences[j], pair['score']))

91
a river filled with lots of water and trees 		 a river filled with lots of water and trees 		 Score: 1.0000
a forest filled with lots of trees and shrubs 		 a forest filled with lots of trees and bushes 		 Score: 0.9904
a blurry photo of some rocks and trees 		 a blurry picture of a rocky area with trees and rocks 		 Score: 0.9507
a river filled with lots of trees and shrubbery 		 a river filled with lots of water and trees 		 Score: 0.9368
a river filled with lots of trees and shrubbery 		 a river filled with lots of water and trees 		 Score: 0.9368
a river with a bunch of trees on top of it 		 a river filled with lots of water and trees 		 Score: 0.9193
a river with a bunch of trees on top of it 		 a river filled with lots of water and trees 		 Score: 0.9193
a river with a bunch of trees on top of it 		 a river filled with lots of trees and shrubbery 		 Score: 0.9055
a river with a bunch of trees on top of it 		 a river flowing through a forest filled with trees 		 Score: 0.8889
a

# IGNORE FOR NOW

In [33]:
crops = ImageDataset(cuts_path)
for cut_im in crops:
    print(cut_im)

./output_cropped_images/original_demo_image_0_opencv_cut1.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut10.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut11.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut12.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut13.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut14.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut2.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut3.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut4.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut5.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut6.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut7.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut8.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut9.jpeg


In [43]:
cut_image = cv2.imread(crops[0])

labels = cut_image.shape[:2]

cut_image = Image.open(crops[0])

inputs = obj_feature_extractor(images=cut_image, return_tensors="pt")
for i in inputs:
    inputs[i] = inputs[i].to(device)

outputs = obj_dect_model(**inputs)

# convert outputs (bounding boxes and class logits) to COCO API
target_sizes = torch.tensor([labels[::-1]])

results = obj_feature_extractor.post_process(outputs, target_sizes=target_sizes)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    # let's only keep detections with score > 0.9
    if score > 0.9:
        print(
            f"Detected {obj_dect_model.config.id2label[label.item()]} with confidence "
            f"{round(score.item(), 3)} at location {box}"
        )

{'scores': tensor([0.0124, 0.0487, 0.0713, 0.0348, 0.0545, 0.0453, 0.0177, 0.0089, 0.0049,
        0.0034, 0.0715, 0.0487, 0.0356, 0.0308, 0.0391, 0.0095, 0.0015, 0.0041,
        0.0166, 0.0399, 0.0323, 0.0526, 0.0062, 0.0286, 0.0494, 0.0479, 0.0117,
        0.0203, 0.0185, 0.0027, 0.0076, 0.0224, 0.0158, 0.0058, 0.0210, 0.0485,
        0.0173, 0.0376, 0.0110, 0.0073, 0.0112, 0.0107, 0.0441, 0.0222, 0.0065,
        0.0321, 0.0100, 0.0532, 0.0085, 0.0091, 0.0050, 0.0492, 0.0161, 0.0422,
        0.0415, 0.0302, 0.0225, 0.0425, 0.0469, 0.0528, 0.0337, 0.0408, 0.0031,
        0.0042, 0.0050, 0.0297, 0.0014, 0.0130, 0.0127, 0.0072, 0.0036, 0.0338,
        0.0320, 0.0384, 0.0339, 0.0809, 0.0341, 0.0028, 0.0472, 0.0057, 0.0252,
        0.0471, 0.0130, 0.0299, 0.0127, 0.0183, 0.0122, 0.0528, 0.0044, 0.0149,
        0.0416, 0.0406, 0.0459, 0.0429, 0.0432, 0.0324, 0.0016, 0.0150, 0.0414,
        0.0497], grad_fn=<UnbindBackward0>), 'labels': tensor([23, 25, 23, 25, 25, 25, 23, 23, 23, 25, 23, 23

In [None]:
# PIL
# (0, 0, 175, 266)
# (0, 266, 175, 532)
# (0, 532, 175, 798)
# (175, 0, 350, 266)
# (175, 266, 350, 532)
# (175, 532, 350, 798)
# (350, 0, 525, 266)
# (350, 266, 525, 532)
# (350, 532, 525, 798)
# (0, 0, 350, 532)
# (0, 266, 350, 798)
# (175, 0, 525, 532)
# (175, 266, 525, 798)
# (0, 0, 525, 798)
# opencv
# [(266, 175, 3),
#  (266, 175, 3),
#  (266, 175, 3),
#  (266, 175, 3),
#  (266, 175, 3),
#  (266, 175, 3),
#  (266, 175, 3),
#  (266, 175, 3),
#  (266, 175, 3),
#  (532, 350, 3),
#  (532, 350, 3),
#  (532, 350, 3),
#  (532, 350, 3),
#  (798, 525, 3)]