<a href="https://colab.research.google.com/github/ankesh86/InteractDiffusionSmallData/blob/main/MICO/MICO_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import zipfile

# Path to the ZIP file
zip_file_path = 'mico.zip'
# Directory where the contents will be extracted
extract_to = '/content/'

# Unzipping the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print("File unzipped successfully!")


File unzipped successfully!


In [4]:
pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-gs6shgbr
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-gs6shgbr
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Using cached ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Using cached ftfy-6.3.1-py3-none-any.whl (44 kB)
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369489 sha256=3f9e27c5633068d07bf10ccadf471c6f4f234aaddb825f3dc48c0a2776817d4b
  Stored in directory: /tmp/pip-ephem-wheel-cache-kz53afey/wheels/da/2b/4c/d6691fa9597aac8bb85d2ac13b112deb897d5b50f5ad9a37e4
Successfully built clip
Installing collected packages: ftfy, clip
Successfully installed

In [5]:
import torch
from transformers import CLIPProcessor, CLIPModel
import json
from tqdm import tqdm
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms
import os


In [6]:
# Define the category and action mappings
category_id_to_name = {
    1: 'Monkey',
    2: 'Banana',
    3: 'Tree',
    4: 'Water',
    5: 'Rock',
    6: 'Flower',
    7: 'Fruit',
    8: 'Baby Monkey',
}

action_id_to_name = {
    1: 'Eating',
    2: 'Climbing',
    3: 'Drinking',
    4: 'Sitting',
    5: 'Smelling',
    6: 'Swinging',
    7: 'Holding',
    8: 'Carrying',
}


In [7]:
def get_category_name(id):
    return category_id_to_name.get(id, 'unknown')

def get_action_name(id):
    return action_id_to_name.get(id, 'unknown')

def xywh_to_xyxy(bbox):
    x, y, w, h = bbox
    return [x, y, x + w, y + h]

def xyxy_to_xywh(bbox):
    xmin, ymin, xmax, ymax = bbox
    return [xmin, ymin, xmax - xmin, ymax - ymin]


In [8]:
class CustomHOIDataset(Dataset):
    def __init__(self, annotations_json_path, image_root):
        self.image_root = image_root
        self.annotations_json_path = annotations_json_path

        # Load annotations
        with open(annotations_json_path, 'r') as f:
            self.annotations = json.load(f)

        # Initialize CLIP model and processor
        version = "openai/clip-vit-large-patch14"
        self.model = CLIPModel.from_pretrained(version).cuda()
        self.processor = CLIPProcessor.from_pretrained(version)

        # Preprocess function for images
        self.preprocess = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                                 std=[0.26862954, 0.26130258, 0.27577711])
        ])

    def __getitem__(self, index):
        anno = self.annotations[index]
        img_id = anno['img_id']
        file_name = anno['file_name']
        image_path = os.path.join(self.image_root, file_name)
        image = Image.open(image_path).convert('RGB')

        # Get annotations
        annotations = anno['annotations']
        hois = anno['hoi_annotation']

        data_hois = []
        prompts = []

        for hoi in hois:
            subject_id = hoi['subject_id']
            object_id = hoi['object_id']
            action_id = hoi['category_id']

            subject_annotation = annotations[subject_id]
            object_annotation = annotations[object_id]

            subject_category_id = subject_annotation['category_id']
            object_category_id = object_annotation['category_id']

            subject_category_name = get_category_name(subject_category_id)
            object_category_name = get_category_name(object_category_id)
            action_name = get_action_name(action_id)

            # Convert bboxes to [xmin, ymin, xmax, ymax]
            subject_bbox = xywh_to_xyxy(subject_annotation['bbox'])
            object_bbox = xywh_to_xyxy(object_annotation['bbox'])

            # Crop images for subject and object
            subject_image_crop = self.preprocess(image.crop(subject_bbox))
            object_image_crop = self.preprocess(image.crop(object_bbox))

            # Compute maximum bounding box covering both subject and object
            max_bbox_coords = self.max_bbox(subject_bbox, object_bbox)
            action_image_crop = self.preprocess(image.crop(max_bbox_coords))

            prompt = f"a {subject_category_name} is {action_name} a {object_category_name}"
            prompts.append(prompt)

            # Prepare inputs for CLIP model
            with torch.no_grad():
                inputs = self.processor(
                    text=[subject_category_name, object_category_name, action_name],
                    images=[subject_image_crop, object_image_crop, action_image_crop],
                    return_tensors="pt",
                    padding=True
                )
                inputs = {k: v.cuda() for k, v in inputs.items()}
                outputs = self.model(**inputs)

            text_before_features = outputs.text_model_output.pooler_output
            text_after_features = outputs.text_embeds

            image_before_features = outputs.vision_model_output.pooler_output
            image_after_features = outputs.image_embeds

            data_hois.append({
                'subject_xywh': xyxy_to_xywh(subject_bbox),
                'object_xywh': xyxy_to_xywh(object_bbox),
                'action': action_name,
                'subject': subject_category_name,
                'object': object_category_name,

                'subject_text_embedding_before': text_before_features[0].cpu(),
                'subject_text_embedding_after': text_after_features[0].cpu(),
                'subject_image_embedding_before': image_before_features[0].cpu(),
                'subject_image_embedding_after': image_after_features[0].cpu(),
                'object_text_embedding_before': text_before_features[1].cpu(),
                'object_text_embedding_after': text_after_features[1].cpu(),
                'object_image_embedding_before': image_before_features[1].cpu(),
                'object_image_embedding_after': image_after_features[1].cpu(),
                'action_text_embedding_before': text_before_features[2].cpu(),
                'action_text_embedding_after': text_after_features[2].cpu(),
                'action_image_embedding_before': image_before_features[2].cpu(),
                'action_image_embedding_after': image_after_features[2].cpu()
            })
            del image_before_features, image_after_features, text_before_features, text_after_features, outputs, inputs

        return {
            'file_name': file_name,
            'img_id': img_id,
            'image': image,
            'data_id': img_id,
            'caption': ", ".join(prompts),
            'hois': data_hois
        }

    def __len__(self):
        return len(self.annotations)

    def max_bbox(self, box1, box2):
        return [
            min(box1[0], box2[0]),
            min(box1[1], box2[1]),
            max(box1[2], box2[2]),
            max(box1[3], box2[3])
        ]


In [11]:
# Paths to your dataset
dataset_root = 'mico'
annotations_json_path = annotation_path = os.path.join(dataset_root, 'annotations/test_mico.json')
image_root = os.path.join(dataset_root, "images", "test2015")
print(annotations_json_path)
print(image_root)
save_root = 'custom_dataset_clip_embeddings'
os.makedirs(save_root, exist_ok=True)


mico/annotations/test_mico.json
mico/images/test2015


In [13]:
# Create dataset and process it
dataset = CustomHOIDataset(annotations_json_path, image_root)

for d in tqdm(dataset):
    torch.save(d, os.path.join(save_root, f"embed_{d['img_id']}.clip.pt"))


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx