# CLIP Embeddings and NLP prompt

If you run the [Generate Embeddings Workflow](https://docs.aperturedata.io/workflows/embeddings_extraction), 
embeddings will be computed for each image and PDF present on ApertureDB, using the "ViT-B/16" model from [OpenAI CLIP](https://openai.com/index/clip/).

We can query these using a natural language prompt.

In [None]:
from aperturedb.CommonLibrary import create_connector
from aperturedb import NotebookHelpers as nh

# Connect to ApertureDB.
# This will only work if you have aperturedb installed and configured.
# The configuration is either created by setting an APERTUREDB_KEY environment variable,
# or by creating a configuration using adb config.
# See https://docs.aperturedata.io/Setup/client/adb for more information.
# NOTE: the key string below is a placeholder; replace it with the key
# obtained from your ApertureDB instance.
client = create_connector(key="Get this key from the ApertureDB instance")

# If you wish to explicitly use the Connector class, you can do so like this:
#from aperturedb import Connector as Connector
#client = Connector.Connector(host="<DB_HOST>", user="admin", password="<YOUR_PASSWORD_HERE>")

# Sanity check: send a GetStatus command and print the server's reply.
response, _ = client.query([{"GetStatus": {}}])
client.print_last_response()


## Find images similar to text

Assuming that we have imported some images,
we can search for them using a text caption.

In [None]:
import clip

# Load the ViT-B/16 CLIP model on the CPU.
model, preprocess = clip.load("ViT-B/16", device="cpu")

# Natural language prompt; uncomment one of the alternatives to experiment.
prompt = "a photo of a group of more than 2 people"
# prompt = "a photo of people with dogs"
# prompt = "a photo of a happy baby"

# Encode the prompt with CLIP's text encoder and serialise the resulting
# vector to raw bytes, which is how it is handed to the query below.
text_features = model.encode_text(clip.tokenize([prompt]).to("cpu"))
prompt_embedding = text_features[0].detach().numpy().tobytes()

# First command: the 10 nearest descriptors in the "wf_embeddings_clip" set.
find_descriptors = {
    "FindDescriptor": {
        "_ref": 1,
        "k_neighbors": 10,
        "set": "wf_embeddings_clip",
    }
}

# Second command: the images connected to those descriptors, with their blobs.
find_images = {
    "FindImage": {
        "_ref": 2,
        "blobs": True,
        "is_connected_to": {
            "ref": 1
        },
        "results": {
            "limit": 10
        }
    }
}

query = [find_descriptors, find_images]

# The prompt embedding is the only input blob; the returned blobs are the images.
r, blobs = client.query(query, [prompt_embedding])
client.print_last_response()

print(len(blobs))
nh.display(blobs)

## Find text similar to text

This assumes that you have text segments, perhaps ingested from PDFs, and have generated embeddings for them with something like the "Generate Embeddings" workflow.

In [None]:
from aperturedb import NotebookHelpers as nh
import clip

# Load the ViT-B/16 CLIP model on the CPU and switch it to evaluation mode.
model, preprocess = clip.load("ViT-B/16", device="cpu")
model.eval()

# Natural language prompt
prompt = "insert descriptive phrase here"

# Descriptor set that is searched for text embeddings.
descriptor_set = "wf_embeddings_clip_text"

# Encode the prompt and serialise the vector to raw bytes for the query.
text_features = model.encode_text(clip.tokenize([prompt]).to("cpu"))
prompt_embedding = text_features[0].detach().numpy().tobytes()

# Single command: the 10 nearest descriptors, returned with all their properties.
find_descriptors = {
    "FindDescriptor": {
        "k_neighbors": 10,
        "set": descriptor_set,
        "results": {"all_properties": True},
    }
}
query = [find_descriptors]

r, _ = client.query(query, [prompt_embedding])
# client.print_last_response()

# Print title, page number and text of every matching segment.
for segment in r[0]["FindDescriptor"]["entities"]:
    print(f"{segment.get('title')} page {segment.get('page_number')}\n{segment.get('text')}\n")

## Find the parts of a video that most closely resemble the input prompt

This assumes that the instance has at least one video already ingested.

In [None]:
from aperturedb import NotebookHelpers as nh
import clip
from aperturedb.CommonLibrary import execute_query

# Load the ViT-B/16 CLIP model on the CPU and switch it to evaluation mode.
model, preprocess = clip.load("ViT-B/16", device="cpu")
model.eval()

# Natural language prompt
prompt = "Descriptive prompt here"

search_tokens = clip.tokenize([prompt]).to("cpu")
search_embeddings = model.encode_text(search_tokens)
descriptor_set = "wf_embeddings_clip_video"

# Raw bytes of the prompt embedding; passed as the input blob of the query.
blobs = search_embeddings[0].detach().numpy().tobytes()

# Three chained commands:
#   1. the 10 descriptors nearest to the prompt embedding,
#   2. the clips connected to those descriptors, grouped by source descriptor,
#   3. the videos connected to those clips.
query = [{
    "FindDescriptor": {
        "k_neighbors": 10,
        "set": descriptor_set,
        "results": {"all_properties": True},
        "_ref": 1
    }
}, {
    "FindClip": {
        "_ref": 2,
        "is_connected_to": {
            "ref": 1
        },
        "results": {
            "group_by_source": True,
            "all_properties": True
        },
    }
}, {
    "FindVideo": {
        "is_connected_to": {
            "ref": 2
        },
        "results": {
            "all_properties": True
        }
    }
}]

status, r, cblobs = execute_query(client, query, [blobs])
assert status == 0, r

descriptors = r[0]["FindDescriptor"]["entities"]  # This is a list of descriptors
clips = r[1]["FindClip"]["entities"]  # This is a dictionary of source -> list of clips
videos = r[2]["FindVideo"]["entities"]
print(f"{len(clips)=} {len(videos)=}")

# Map each descriptor id to the id of its first connected clip.
# Descriptors whose clip list is empty are left out instead of raising IndexError.
clip_ids = {d_id: c[0].get('_uniqueid') for d_id, c in clips.items() if c}
video_ids = [v.get('_uniqueid') for v in videos]

# Show the clips for the first 5 descriptors
for d in descriptors[0:5]:
    d_id = d.get('_uniqueid')
    clip_id = clip_ids.get(d_id)
    if clip_id is None:
        # A descriptor without a connected clip is skipped rather than
        # aborting the whole cell with a KeyError.
        print(f"No clip found for descriptor {d_id}; skipping")
        continue

    clip_query = [{
        "FindClip": {
            "constraints": {
                "_uniqueid": ["==", clip_id]
            },
            "results": {
                "all_properties": True
            },
            "blobs": True
        }
    }]
    # Separate names keep the results of the main query above intact.
    clip_status, clip_response, clip_blobs = execute_query(client, clip_query, [])
    assert clip_status == 0, clip_response
    if not clip_blobs:
        print(f"No blob returned for clip {clip_id}; skipping")
        continue
    nh.display_video_mp4(clip_blobs[0])



## Show the full videos the clips are derived from

In [None]:
# Show the full videos (at most the first 5 ids in video_ids) that the clips
# were derived from. Each video is fetched and displayed only once.
retrieved_videos = set()
for v_id in video_ids[0:5]:
    if v_id in retrieved_videos:
        # Already shown; skip before building or sending another query.
        continue

    q = [{
        "FindVideo": {
            "constraints": {
                "_uniqueid": ["==", v_id]
            },
            "results": {
                "all_properties": True
            },
            "blobs": True
        }
    }]

    status, r, vblobs = execute_query(client, q, [])
    assert status == 0, r
    retrieved_videos.add(v_id)
    if not vblobs:
        # Nothing to display for this id; avoid an IndexError on vblobs[0].
        print(f"No blob returned for video {v_id}; skipping")
        continue
    nh.display_video_mp4(vblobs[0])