In [1]:
import torch
import cv2
import os
import re
from matplotlib import pyplot as plt
from PIL import Image
from transformers import pipeline
from torch.utils.data import Dataset, DataLoader
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer, DetrFeatureExtractor, DetrForObjectDetection
import timm  # needed for resnet

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
encoder_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
encoder_feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

gpt2_generator = pipeline('text-generation', model='gpt2')
# not doing object detection for now due to issues with paintings and cartoon-like images
# obj_feature_extractor = DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50")
# obj_dect_model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

encoder_model.to(device)
#obj_dect_model.to(device)
print()




In [5]:
class ImageDataset(Dataset):

    def __init__(self, image_dir):
        """Initialize the attributes of the object of the class."""
        self.image_dir = image_dir
        image_path_jpeg = sorted(self.find_files(image_dir, pattern=".jpeg"))
        image_path_png = sorted(self.find_files(image_dir, pattern=".png"))
        self.image_path_list = image_path_jpeg + image_path_png

    def __len__(self):
        """Return the size of the dataset."""
        return len(self.image_path_list)

    def __getitem__(self, index):
        """Return a data sample (=image) for a given index, along with the name of the corresponding pokemon."""
        
        return self.image_path_list[index]        
    
    def find_files(self, directory, pattern=".jpg"):
        """
        Recursive search method to find files. Credit to Paul Magron and Andrea de Marco
        for OG method
        """

        return  [f.path for f in os.scandir(directory) if f.path.endswith(pattern)]

In [4]:
def predictive_step(im_path, im_reader="PIL", max_len=16, num_beams=4):
    gen_kwargs = {"max_length": max_len, "num_beams": num_beams}
    all_ims = []
    for im in im_path:
        #i_image = Image.open(im)        
        i_image = cv2.imread(im)        
        #if i_image.mode != "RGB":            
            #i_image = i_image.convert(mode="RGB")
        all_ims.append(i_image)
    
    pixel_values = encoder_feature_extractor(images=all_ims, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    output_ids = encoder_model.generate(pixel_values, **gen_kwargs)
    
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds

In [6]:
og_images_path = ImageDataset("./images_to_test_sys")

#predictive_step()

In [20]:
sample = og_images_path[2:3]  # './images_to_test_sys/renoir_painting.jpeg'
print(sample)
sample = ["./images_to_test_sys/original_demo_image_2.png"]
predictive_step(sample)  # passing as a list to be able to iterate

['./images_to_test_sys/renoir_painting.jpeg']


['a painting of a boat in the middle of a lake']

In [9]:
img = Image.open(sample[0])  # we use PIL for demo purposes
img.show()

In [40]:
print(img.size)
print(img.size[::-1])
#[image.size[::-1]]

(526, 800)
(800, 526)


In [21]:
nameRegex = re.compile(r"\./images_to_test_sys(/.*)\.\w+$")
mo = nameRegex.search(sample[0])
image_name = mo.group(1)

In [22]:
#image = cv2.imread(sample[0], cv2.IMREAD_GRAYSCALE)
image = cv2.imread(sample[0])

cv2.imshow("Patched Image",image)
cv2.waitKey(5000)
cv2.destroyAllWindows()

In [11]:
image.shape

(400, 292, 3)

# Old version.

Try running the cell from next cell

In [171]:
# OPENCV's method based on Omar's and Alexio's Algorithm
cuts_path = "./output_cropped_images/"
hgt, wid, channels = image.shape
#img.show()
print(wid, hgt)
nb_row = 3
nbr = 1
Wim = wid//nb_row
Him = hgt//nb_row

total_tiles = {}
group_tiles = []
all_titles = []
for k in range(1, nb_row+1):
    for i in range(0, nb_row-k+1):
        for j in range(0, nb_row-k+1):
            
            y = Him*j
            corr_height = Him * (j+k)
            #y1 = corr_height + nb_row
            x = Wim*i
            corr_width = Wim * (i+k)
            #x1 = corr_width + nbr                         
            height_cut = corr_height - y
            width_cut = corr_width - x
            cut_dims = (height_cut, width_cut)
            if cut_dims not in total_tiles.keys():
                group_tiles = []
                
            # coordinates are wrong, double check them with team
            top_left_pixel = (x, corr_height)
            bottom_right_pixel = (y, corr_width)
            coordinates = [top_left_pixel, bottom_right_pixel]
            
            tiles = image[y:corr_height, x:corr_width]

            #group_tiles.append(tiles)  # we need the dims to draw the rectangles, not the tiles
            group_tiles.append(coordinates)
            total_tiles.setdefault(cut_dims, group_tiles)
            
            title = cuts_path + image_name + '_opencv_cut'+str(nbr)+'.jpeg'       
                        
            # cv2.imwrite('saved_patches/'+'tile'+str(x)+'_'+str(y)+'.jpg', tiles)
            cv2.imwrite(title, tiles)
            #cv2.rectangle(image, (y, corr_height), (x, corr_width), (0, 255, 0), 1)            
            all_titles.append(title)
            
            nbr += 1           
#plt.imshow(image[:, :, ::-1])    

526 800


# Refactored Omar's and Alexio's cutting version of the algorithm

In [23]:
def imcrop(img, bbox): 
    x1,y1,x2,y2 = bbox
    if x1 < 0 or y1 < 0 or x2 > img.shape[1] or y2 > img.shape[0]:
        img, x1, x2, y1, y2 = pad_img_to_fit_bbox(img, x1, x2, y1, y2)  
    if len(img.shape) == 3:
        return img[y1:y2, x1:x2, :]
    else:
        return img[y1:y2, x1:x2]

def pad_img_to_fit_bbox(img, x1, x2, y1, y2):
    img = np.pad(img, ((np.abs(np.minimum(0, y1)), np.maximum(y2 - img.shape[0], 0)),
               (np.abs(np.minimum(0, x1)), np.maximum(x2 - img.shape[1], 0)), (0,0)), mode="constant")
    y2 += np.abs(np.minimum(0, y1))
    y1 += np.abs(np.minimum(0, y1))
    x2 += np.abs(np.minimum(0, x1))
    x1 += np.abs(np.minimum(0, x1))
    
    return img, x1, x2, y1, y2

In [24]:
cuts_path = "./output_cropped_images/"
hgt, wid, channels = image.shape #this is if reading in color mode
#hgt, wid = image.shape # black and white
#img.show()
print(wid, hgt)
nb_row = 3
nbr = 1
Wim = wid//nb_row
Him = hgt//nb_row

all_titles = []

for k in range(1, nb_row+1):
    for i in range(0, nb_row-k+1):
        for j in range(0, nb_row-k+1):

            bbox = (Wim*i, Him*j, Wim*(i+k), Him*(j+k))
            tiles = imcrop(image, bbox)

            title = cuts_path + image_name + '_opencv_cut'+str(nbr)+'.jpeg'       
            #title = cuts_path + image_name + '_opencv_cut'+str(nbr)+'.jpg'       
                        
            
            cv2.imwrite(title, tiles)
            #cv2.rectangle(image, (y, corr_height), (x, corr_width), (0, 255, 0), 1)            
            all_titles.append(title)
            nbr += 1    

507 348


In [30]:
# for k in total_tiles:
#     print('-'*8)
#     image_copy = image.copy() 
#     for v in total_tiles[k]: 
#         cv2.rectangle(image_copy, v[0], v[1], (0, 255, 0), 1)
#         print(v)
#     cv2.imshow("Patched Image",image_copy)
#     cv2.waitKey(5000)
#     cv2.destroyAllWindows()
    
    #plt.imshow(image_copy[:, :, ::-1])

In [25]:
liste_captions = predictive_step(all_titles)

In [26]:
liste_captions

['an old photograph of a person standing in front of a white background',
 'a painting of a group of animals on a wall',
 'a painting of a river with trees',
 'a black and white photo of some clouds',
 'a snowy landscape with a train on the tracks',
 'a painting of a group of people on a beach',
 'a painting of a vase with flowers in it',
 'a painting of a tree filled with lots of flowers',
 'a painting of a river with a bunch of animals',
 'a painting of a group of animals on a hillside',
 'a painting of a group of animals in a field',
 'a painting of flowers and birds on a wall',
 'a painting of a river with a bunch of trees',
 'a painting of a boat in the middle of a lake']

In [44]:
#https://towardsdatascience.com/an-intuitive-explanation-of-sentence-bert-1984d144a868
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [45]:
sentences = ['a forest filled with lots of trees and shrubs', 'a blurry photo of some rocks and trees',
             'a blurry picture of a rocky area with trees and rocks', 'a painting of a forest with trees and a river',
             'a waterfall that is in the middle of a forest', 'a river with a bunch of animals in it',
             'a bird perched on top of a tree branch', 'a forest filled with lots of trees and bushes',
             'a river with a bunch of trees on top of it', 'a river filled with lots of trees and shrubbery',
             'a river filled with lots of water and trees', 'a stream of water flowing through a forested area',
             'a river filled with lots of water and trees', 'a river flowing through a forest filled with trees']

sentences = sorted(set(liste_captions))

In [53]:
#encode the sentences 
embeddings = model.encode(sentences, convert_to_tensor=True)#compute the similarity scores
cosine_scores = util.cos_sim(embeddings, embeddings)#compute/find the highest similarity scores
pairs = []
for i in range(len(cosine_scores)-1):
    for j in range(i+1, len(cosine_scores)):
        pairs.append({'index': [i, j], 'score': cosine_scores[i] 
                                                             [j]})#sort the scores in decreasing order 
pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)
print(len(pairs))

91


In [58]:
embeddings.shape
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

#input_sentence = "They were there to enjoy us and they were there to pray for us."

model = BartForConditionalGeneration.from_pretrained('eugenesiow/bart-paraphrase')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"
model = model.to(device)
tokenizer = BartTokenizer.from_pretrained('eugenesiow/bart-paraphrase')
batch = tokenizer(input_text, return_tensors='pt')
generated_ids = model.generate(batch['input_ids'])
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

print(generated_text)


Downloading:   0%|                                  | 0.00/1.63G [00:00<?, ?B/s][A
Downloading:   0%|                        | 52.2k/1.63G [00:00<1:09:03, 392kB/s][A
Downloading:   0%|                          | 218k/1.63G [00:00<26:22, 1.03MB/s][A
Downloading:   0%|                          | 471k/1.63G [00:00<16:24, 1.65MB/s][A
Downloading:   0%|                          | 897k/1.63G [00:00<10:19, 2.62MB/s][A
Downloading:   0%|                         | 1.77M/1.63G [00:00<05:41, 4.75MB/s][A
Downloading:   0%|                         | 3.24M/1.63G [00:00<03:21, 8.04MB/s][A
Downloading:   0%|                         | 4.75M/1.63G [00:00<02:37, 10.3MB/s][A
Downloading:   0%|                         | 6.30M/1.63G [00:00<02:15, 11.9MB/s][A
Downloading:   0%|                         | 7.85M/1.63G [00:00<02:04, 13.0MB/s][A
Downloading:   1%|▏                        | 9.57M/1.63G [00:01<01:53, 14.2MB/s][A
Downloading:   1%|▏                        | 11.4M/1.63G [00:01<01:45, 15.3

['A black and white photo of some clouds, a painting of a boat in the middle of']


In [59]:
src_sentences = []
target_sentences = []
for pair in pairs:
    i, j = pair['index']
    if 0.60 < pair["score"] < 0.90:  # avoiding redundant matches
        print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i],
                                  sentences[j], pair['score']))
        src_sentences.append(sentences[i])
        target_sentences.append(sentences[j])
all_sentences = list(set(src_sentences + target_sentences))

a painting of a group of animals in a field 		 a painting of a group of animals on a hillside 		 Score: 0.8973
a painting of a river with a bunch of animals 		 a painting of a river with a bunch of trees 		 Score: 0.8120
a painting of a group of animals in a field 		 a painting of a group of animals on a wall 		 Score: 0.7960
a painting of a river with a bunch of animals 		 a painting of a river with trees 		 Score: 0.7943
a painting of a group of animals on a hillside 		 a painting of a group of animals on a wall 		 Score: 0.7874
a painting of a boat in the middle of a lake 		 a painting of a river with trees 		 Score: 0.7781
a painting of a boat in the middle of a lake 		 a painting of a river with a bunch of trees 		 Score: 0.7715
a painting of a group of animals on a hillside 		 a painting of a river with a bunch of animals 		 Score: 0.7674
a painting of a group of animals in a field 		 a painting of a river with a bunch of animals 		 Score: 0.7620
a painting of a tree filled with 

In [63]:
all_sentences
input_text = '. '.join(all_sentences)
print(input_text)

a painting of a vase with flowers in it. a painting of a group of animals in a field. a painting of a group of animals on a wall. a painting of a group of animals on a hillside. a painting of flowers and birds on a wall. a painting of a river with a bunch of trees. a painting of a tree filled with lots of flowers. a painting of a boat in the middle of a lake. a painting of a river with a bunch of animals. a painting of a river with trees


In [64]:
model = BartForConditionalGeneration.from_pretrained('eugenesiow/bart-paraphrase')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"
model = model.to(device)
tokenizer = BartTokenizer.from_pretrained('eugenesiow/bart-paraphrase')
batch = tokenizer(input_text, return_tensors='pt')
generated_ids = model.generate(batch['input_ids'])
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

print(generated_text)

['A painting of a vase with flowers in it, a painting of animals in a field']


In [65]:
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(input_text, return_tensors='pt')
output = model(**encoded_input)

In [69]:
from transformers import pipeline, set_seed

generator = pipeline('text-generation', model='gpt2')

set_seed(42)

#generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)
generator(input_text, max_length=128, num_return_sequences=5)  # instead of 30

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'a painting of a vase with flowers in it. a painting of a group of animals in a field. a painting of a group of animals on a wall. a painting of a group of animals on a hillside. a painting of flowers and birds on a wall. a painting of a river with a bunch of trees. a painting of a tree filled with lots of flowers. a painting of a boat in the middle of a lake. a painting of a river with a bunch of animals. a painting of a river with trees. a painting of a river with a huge rock. a painting of a rock with a lot of trees'},
 {'generated_text': 'a painting of a vase with flowers in it. a painting of a group of animals in a field. a painting of a group of animals on a wall. a painting of a group of animals on a hillside. a painting of flowers and birds on a wall. a painting of a river with a bunch of trees. a painting of a tree filled with lots of flowers. a painting of a boat in the middle of a lake. a painting of a river with a bunch of animals. a painting of a river 

In [66]:
output

BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[-0.1559, -0.0341, -0.2607,  ..., -0.0745, -0.0490,  0.0385],
         [-0.1266,  0.5696, -1.1369,  ..., -0.0360,  0.2717, -0.1486],
         [ 0.0431,  0.1370,  0.0551,  ...,  0.0708,  0.0367, -0.2831],
         ...,
         [ 0.0406,  0.8717, -1.7076,  ..., -0.5687, -0.0784, -0.4305],
         [-0.2841,  0.3237, -0.6237,  ..., -0.0036, -0.6758, -0.2554],
         [ 0.0488,  0.7521, -1.5540,  ..., -0.3796, -0.1116,  0.3845]]],
       grad_fn=<ViewBackward0>), past_key_values=((tensor([[[[-1.2407,  2.2139,  0.5919,  ..., -1.1082, -0.8965,  1.7337],
          [-1.9038,  2.2632,  0.6652,  ..., -3.3237, -2.2190,  3.0071],
          [-2.3023,  3.0159,  1.5088,  ..., -0.8810, -1.9270,  2.5491],
          ...,
          [-0.2970,  2.2258,  1.7521,  ..., -1.9984, -1.3774, -0.1998],
          [-0.6048,  2.7840,  2.2150,  ..., -0.5087, -0.6721,  0.9208],
          [ 0.5030,  3.1326,  2.3630,  ..., -2.1833, -0.5170, -0.6311]],

In [42]:
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-distilroberta-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-distilroberta-base-v2')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=32, return_tensors='pt')  # I added max_length to reduce complexity

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, max pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

print("Sentence embeddings:")
print(sentence_embeddings, sentence_embeddings.shape)  # num sentences and embeddings?

Sentence embeddings:
tensor([[-0.4722,  1.0633, -0.1901,  ...,  0.1442, -0.0643,  0.5640],
        [-0.3990,  0.4795,  0.3016,  ...,  0.2571, -0.1955, -0.1905]]) torch.Size([2, 768])


In [None]:
help(tokenizer)

# IGNORE FOR NOW

In [33]:
crops = ImageDataset(cuts_path)
for cut_im in crops:
    print(cut_im)

./output_cropped_images/original_demo_image_0_opencv_cut1.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut10.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut11.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut12.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut13.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut14.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut2.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut3.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut4.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut5.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut6.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut7.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut8.jpeg
./output_cropped_images/original_demo_image_0_opencv_cut9.jpeg


In [43]:
cut_image = cv2.imread(crops[0])

labels = cut_image.shape[:2]

cut_image = Image.open(crops[0])

inputs = obj_feature_extractor(images=cut_image, return_tensors="pt")
for i in inputs:
    inputs[i] = inputs[i].to(device)

outputs = obj_dect_model(**inputs)

# convert outputs (bounding boxes and class logits) to COCO API
target_sizes = torch.tensor([labels[::-1]])

results = obj_feature_extractor.post_process(outputs, target_sizes=target_sizes)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    # let's only keep detections with score > 0.9
    if score > 0.9:
        print(
            f"Detected {obj_dect_model.config.id2label[label.item()]} with confidence "
            f"{round(score.item(), 3)} at location {box}"
        )

{'scores': tensor([0.0124, 0.0487, 0.0713, 0.0348, 0.0545, 0.0453, 0.0177, 0.0089, 0.0049,
        0.0034, 0.0715, 0.0487, 0.0356, 0.0308, 0.0391, 0.0095, 0.0015, 0.0041,
        0.0166, 0.0399, 0.0323, 0.0526, 0.0062, 0.0286, 0.0494, 0.0479, 0.0117,
        0.0203, 0.0185, 0.0027, 0.0076, 0.0224, 0.0158, 0.0058, 0.0210, 0.0485,
        0.0173, 0.0376, 0.0110, 0.0073, 0.0112, 0.0107, 0.0441, 0.0222, 0.0065,
        0.0321, 0.0100, 0.0532, 0.0085, 0.0091, 0.0050, 0.0492, 0.0161, 0.0422,
        0.0415, 0.0302, 0.0225, 0.0425, 0.0469, 0.0528, 0.0337, 0.0408, 0.0031,
        0.0042, 0.0050, 0.0297, 0.0014, 0.0130, 0.0127, 0.0072, 0.0036, 0.0338,
        0.0320, 0.0384, 0.0339, 0.0809, 0.0341, 0.0028, 0.0472, 0.0057, 0.0252,
        0.0471, 0.0130, 0.0299, 0.0127, 0.0183, 0.0122, 0.0528, 0.0044, 0.0149,
        0.0416, 0.0406, 0.0459, 0.0429, 0.0432, 0.0324, 0.0016, 0.0150, 0.0414,
        0.0497], grad_fn=<UnbindBackward0>), 'labels': tensor([23, 25, 23, 25, 25, 25, 23, 23, 23, 25, 23, 23

In [None]:
# PIL
# (0, 0, 175, 266)
# (0, 266, 175, 532)
# (0, 532, 175, 798)
# (175, 0, 350, 266)
# (175, 266, 350, 532)
# (175, 532, 350, 798)
# (350, 0, 525, 266)
# (350, 266, 525, 532)
# (350, 532, 525, 798)
# (0, 0, 350, 532)
# (0, 266, 350, 798)
# (175, 0, 525, 532)
# (175, 266, 525, 798)
# (0, 0, 525, 798)
# opencv
# [(266, 175, 3),
#  (266, 175, 3),
#  (266, 175, 3),
#  (266, 175, 3),
#  (266, 175, 3),
#  (266, 175, 3),
#  (266, 175, 3),
#  (266, 175, 3),
#  (266, 175, 3),
#  (532, 350, 3),
#  (532, 350, 3),
#  (532, 350, 3),
#  (532, 350, 3),
#  (798, 525, 3)]