In [31]:
# Install pinned versions of the packages for the rephrasing step;
# -qqq and --progress-bar off keep the cell output short.
# NOTE(review): bitsandbytes and accelerate are not imported in the visible
# cells — confirm they are still required.
!pip install -qqq transformers==4.28.1 --progress-bar off
!pip install -qqq bitsandbytes==0.38.1 --progress-bar off
!pip install -qqq accelerate==0.18.0 --progress-bar off
!pip install -qqq sentencepiece==0.1.99 --progress-bar off


In [32]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Identifier of the pretrained T5 checkpoint passed to `from_pretrained` below
# for both the tokenizer and the model.
checkpoint="unikei/t5-base-split-and-rephrase"

#####################################################################
import numpy as np
from tqdm import tqdm
import pickle
import torch

#####################################################################


# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print('Using {} device'.format(device))

# Tokenizer and model come from the same checkpoint; the model is moved to
# the selected device right after loading.
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
model = model.to(device)


def sentence_rephraser(data, tokenizer, model, file_path):
    """Extend the caption list of every sample with model-generated rephrasings.

    For each key of ``data`` the sentences in ``data[key]['caption']`` are
    tokenized, passed to ``model.generate`` with beam search, decoded, and the
    decoded sentences are appended to that same caption list. ``data`` is
    modified in place and also returned.

    Args:
        data (dict): maps a sample key to a dict holding a ``'caption'`` list
            of sentences.
        tokenizer: tokenizer paired with ``model``; must be callable on a list
            of sentences and provide ``batch_decode``.
        model: generation model providing ``generate`` and a ``device``
            attribute (Hugging Face ``PreTrainedModel`` does).
        file_path (str): destination of the intermediate pickle checkpoints.

    Returns:
        dict: the same ``data`` object, with the extended caption lists.
    """
    beam_size = 10  # represent a good trade-off between quality and diversity
    target_captions = 10  # number of new sentences is 10 minus the existing ones
    checkpoint_every = 100  # samples processed between two checkpoints
    # Use the device the model actually lives on instead of a notebook global.
    run_device = model.device

    # Count processed samples rather than testing `key % 100`: that test fails
    # for non-integer keys and checkpoints irregularly when keys are sparse.
    for step, key in enumerate(tqdm(data.keys()), start=1):
        captions = data[key]['caption']
        n_new = target_captions - len(captions)

        # Skip samples that have no caption to rephrase or that already hold
        # enough of them: `generate` rejects num_return_sequences <= 0.
        if captions and n_new > 0:
            # tokenize complex sentence
            complex_tokenized = tokenizer(
                captions,
                padding="max_length",
                truncation=True,
                max_length=256,
                return_tensors='pt',
            )

            # NOTE(review): with several input captions, generate returns
            # `n_new` candidates per caption, so the list can grow beyond
            # `target_captions` — confirm this is intended.
            simple_tokenized = model.generate(
                complex_tokenized['input_ids'].to(run_device),
                attention_mask=complex_tokenized['attention_mask'].to(run_device),
                max_length=256,
                num_beams=beam_size,
                num_return_sequences=n_new,
            )

            captions.extend(
                tokenizer.batch_decode(simple_tokenized, skip_special_tokens=True)
            )

        if step % checkpoint_every == 0:
            # save checkpoint
            with open(file_path, 'wb') as f:
                pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

    return data



Using cuda device


Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/852 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/133 [00:00<?, ?B/s]

In [2]:
# Mount Google Drive at /content/drive/ so that the pickle files under
# /content/drive/MyDrive used by the following cells can be read and written.
from google.colab import drive

drive.mount('/content/drive/')


Mounted at /content/drive/


In [None]:
# # load train dataset
# file_name = '/content/drive/MyDrive/deeplearning/refcocog/yolov8x+clip/yolo_v8x_1_dictionary_full_train.p'
# with open(file_name, 'rb') as f:
#     data_train = pickle.load(f)

# data_train = {key: data_train[key] for key in data_train.keys() if key > 9100 and key < 20000}

In [None]:
# file_path = '/content/drive/MyDrive/deeplearning/refcocog/yolov8x+clip/yolo_v8x_1_dictionary_full_train_rephrased_9100.p'

# data_test = sentence_rephraser(data_train, tokenizer, model, file_path)

# with open(file_path, 'wb') as f:
#   pickle.dump(data_test, f, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 10899/10899 [3:22:05<00:00,  1.11s/it]


In [33]:
# Rephrase the validation split: read the pickled dictionary, let the
# rephraser extend every caption list (it also writes periodic checkpoints to
# `file_path`), then store the final result at that same location.
file_name = '/content/drive/MyDrive/deeplearning/refcocog/yolov8x+clip/yolo_v8x_1_dictionary_full_val.p'
file_path = '/content/drive/MyDrive/deeplearning/refcocog/yolov8x+clip/yolo_v8x_1_dictionary_full_val_rephrased.p'

with open(file_name, 'rb') as src:
    data_val = pickle.load(src)

data_val = sentence_rephraser(data_val, tokenizer, model, file_path)

with open(file_path, 'wb') as dst:
    pickle.dump(data_val, dst, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 2573/2573 [47:25<00:00,  1.11s/it]


In [None]:
# # load test dataset
# file_name = '/content/drive/MyDrive/refCOCOg Visual Grounding/yolov8x+clip/yolo_v8x_1_dictionary_full_train.p'
# with open(file_name, 'rb') as f:
#     data_test = pickle.load(f)

# data_test = sentence_rephraser(data_test, tokenizer, model)

# Read and embed the produced sentences

Here we embed, with CLIP, the rephrased sentences produced above.

In [19]:
# merge the two separately preprocessed parts of the training set

import pickle

# file_path1 = '/content/drive/MyDrive/deeplearning/refcocog/yolov8x+clip/yolo_v8x_1_dictionary_full_train_rephrased.p'
# file_path2 = '/content/drive/MyDrive/deeplearning/refcocog/yolov8x+clip/yolo_v8x_1_dictionary_full_train_rephrased_9100.p'

# with open(file_path1, 'rb') as f:
#     data_train1 = pickle.load(f)

# with open(file_path2, 'rb') as f:
#     data_train2 = pickle.load(f)

# data_train = data_train1

# del data_train1

# for k, v in data_train2.items():
#   data_train[k] = v

# del data_train2


# with open(file_path1, 'wb') as f:
#   pickle.dump(data_train, f, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
# Mount Google Drive at /content/drive/ so that the pickle files under
# /content/drive/MyDrive are reachable from this runtime.
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


In [1]:
# Install CLIP straight from its GitHub repository, plus ftfy, regex and tqdm.
# NOTE(review): nothing here is version-pinned, unlike the first install cell
# of this notebook — consider pinning for reproducibility.
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-ibooztva
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-ibooztva
  Resolved https://github.com/openai/CLIP.git to commit a9b1bf5920416aaeaec965c25dd9e8f98c864f16
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-n

In [34]:
# the following function will embed the rephrased sentences
import torch
import matplotlib.pyplot as plt
from torchvision import transforms as T
import clip
from matplotlib.patches import Rectangle
import numpy as np
from PIL import Image
import time
import pandas as pd


# Select the GPU when torch can see one, then load the "ViT-B/32" CLIP model on
# that device. `preprocess` is the second value returned by clip.load and is
# not used in the visible cells.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
clip_model, preprocess = clip.load("ViT-B/32", device=device)


def sentence_cut(pool_sentences, t = 240):
  """Truncate every sentence to at most ``t`` characters.

  Presumably used to keep long captions within the context length accepted by
  ``clip.tokenize`` — the cut is on characters, not tokens.

  Args:
      pool_sentences (list[str] | str): sentences to shorten; a single string
          is treated as a one-sentence pool.
      t (int): maximum number of characters kept per sentence.

  Returns:
      list[str]: the sentences in their original order, each no longer than
      ``t`` characters.
  """
  if isinstance(pool_sentences, str):
    pool_sentences = [pool_sentences]
  # Slicing is a no-op for sentences already shorter than `t`.
  return [s[:t] for s in pool_sentences]

def get_dict_clip_emb(clip_model, dictionary):
    """Add CLIP text embeddings and text/image similarity scores to one sample.

    The captions are shortened with ``sentence_cut``, tokenized and encoded
    with ``clip_model``. Text and image embeddings are L2-normalized along the
    last dimension, and the softmax of their scaled dot products is stored.

    Args:
        clip_model: CLIP model providing ``encode_text``.
        dictionary (dict): sample holding
            ``'caption'``   - the sentences to embed;
            ``'image_emb'`` - tensor of image embeddings (assumed to be one
            row per image crop — TODO confirm against the producer).

    Returns:
        dict: the same ``dictionary`` object, updated in place with
            ``'text_emb'``        - normalized float16 text embeddings (CPU);
            ``'image_emb'``       - normalized image embeddings (CPU);
            ``'text_similarity'`` - softmax over the last dimension of
            ``100 * text_emb @ image_emb.T`` (CPU).
    """
    captions = clip.tokenize(sentence_cut(dictionary['caption'])).to(device)

    with torch.no_grad():
        # get CLIP embeddings
        text_features = clip_model.encode_text(captions).type(torch.float16)

        # Align the image embeddings with the text embeddings (device and
        # dtype) so the matrix product below cannot fail on a dtype mismatch.
        images = dictionary['image_emb'].to(device=device, dtype=text_features.dtype)

        # normalize (out of place, so the caller's tensor is never modified)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        images = images / images.norm(dim=-1, keepdim=True)

        similarity = (100.0 * text_features @ images.T).softmax(dim=-1)

    # store them
    dictionary['text_emb'] = text_features.cpu()
    dictionary['image_emb'] = images.cpu()
    dictionary['text_similarity'] = similarity.cpu()

    return dictionary

In [26]:
from tqdm import tqdm
import pickle

# Embed the rephrased training captions: read the rephrased dictionary, enrich
# every sample with its CLIP embeddings, and write the result to the
# "_final" pickle.
file_path1 = '/content/drive/MyDrive/deeplearning/refcocog/yolov8x+clip/yolo_v8x_1_dictionary_full_train_rephrased.p'

with open(file_path1, 'rb') as src:
    data_train = pickle.load(src)

data_train = dict(
    (sample_id, get_dict_clip_emb(clip_model, sample))
    for sample_id, sample in tqdm(data_train.items())
)

file_path1 = '/content/drive/MyDrive/deeplearning/refcocog/yolov8x+clip/yolo_v8x_1_dictionary_full_train_rephrased_final.p'

with open(file_path1, 'wb') as dst:
    pickle.dump(data_train, dst, protocol=pickle.HIGHEST_PROTOCOL)

  0%|          | 8/20000 [11:06<462:47:29, 83.34s/it]
100%|██████████| 20000/20000 [05:11<00:00, 64.12it/s]


In [27]:
# Embed the rephrased test captions: read the rephrased dictionary, enrich
# every sample with its CLIP embeddings, and write the result to the
# "_final" pickle.
file_path1 = '/content/drive/MyDrive/deeplearning/refcocog/yolov8x+clip/yolo_v8x_1_dictionary_full_test_rephrased.p'

with open(file_path1, 'rb') as src:
    data_test = pickle.load(src)

data_test = dict(
    (sample_id, get_dict_clip_emb(clip_model, sample))
    for sample_id, sample in tqdm(data_test.items())
)

file_path1 = '/content/drive/MyDrive/deeplearning/refcocog/yolov8x+clip/yolo_v8x_1_dictionary_full_test_rephrased_final.p'

with open(file_path1, 'wb') as dst:
    pickle.dump(data_test, dst, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 5023/5023 [01:17<00:00, 64.57it/s]


In [35]:
# Embed the rephrased validation captions: read the rephrased dictionary,
# enrich every sample with its CLIP embeddings, and write the result to the
# "_final" pickle.
file_path = '/content/drive/MyDrive/deeplearning/refcocog/yolov8x+clip/yolo_v8x_1_dictionary_full_val_rephrased.p'

with open(file_path, 'rb') as src:
    data_val = pickle.load(src)

data_val = dict(
    (sample_id, get_dict_clip_emb(clip_model, sample))
    for sample_id, sample in tqdm(data_val.items())
)

file_path1 = '/content/drive/MyDrive/deeplearning/refcocog/yolov8x+clip/yolo_v8x_1_dictionary_full_val_rephrased_final.p'

with open(file_path1, 'wb') as dst:
    pickle.dump(data_val, dst, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 2573/2573 [00:41<00:00, 61.31it/s]
