# Store Embeddings

## Setup for creating image embeddings

In [8]:
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
from datasets import load_dataset

In [7]:
# One CLIP checkpoint supplies all three pieces: the image pre-processor,
# the text tokenizer and the model itself.
model_chkpoint = "openai/clip-vit-base-patch16"

processor = AutoImageProcessor.from_pretrained(model_chkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_chkpoint)
model = AutoModel.from_pretrained(model_chkpoint)

In [10]:
# Load the "explanation" configuration of the caption-contest dataset and
# keep a handle on its training split, which the cells below index into.
ds = load_dataset(
    "jmhessel/newyorker_caption_contest",
    "explanation",
)
trainset = ds["train"]

## Set up PGVector
1. Create db `multimodal_rag`.
```shell
ॐ createdb -O pguser multimodal_rag 'Multimodal RAG demo'

# or if the createdb client does not know about the /run/postgresql socket
ॐ createdb -h /run/postgresql -O pguser multimodal_rag 'Multimodal RAG demo'
```

2. Install the pgvector extension. Even though the database owner is `pguser`, I need to log into the psql shell as an admin user (in this case, one with the same name as my Linux user):
```shell
ॐ psql --dbname multimodal_rag
```
And run -
```sql
CREATE EXTENSION vector;
```
 
3. Create the `images` table.
```sql
CREATE TABLE IF NOT EXISTS images (
 id serial CONSTRAINT i_pri_key PRIMARY KEY,
 filename varchar(1024),
 description text,
 uncanny_description text,
 embedding vector(512)
);
```

In [11]:
import psycopg as pg
from pgvector.psycopg import register_vector
import numpy as np

In [12]:
# Connection string passed to every `pg.connect(...)` call below: it targets
# the `multimodal_rag` database as user `pguser`.
connstr = "dbname=multimodal_rag user=pguser"

In [28]:
# Empty the `images` table so the inserts below start from a clean slate.
with pg.connect(connstr) as conn:
    with conn.cursor() as cur:
        cur.execute("DELETE FROM images")

In [29]:
# Parameterized INSERT for one row of the `images` table. The four `%s`
# placeholders are bound, in order, to filename, description,
# uncanny_description and embedding when the statement is executed.
insert_sql = """
INSERT INTO images (filename, description, uncanny_description, embedding)
VALUES (%s, %s, %s, %s)
"""

In [30]:
# Embed the first 10 training images with the loaded model and insert one
# row per image (filename, both descriptions, embedding) into `images`.
with pg.connect(connstr) as conn:
    # Register pgvector's adapters on this connection so the NumPy embedding
    # can be passed directly as a query parameter.
    # NOTE(review): this runs before the cursor is opened — presumably the
    # cursor has to be created afterwards to see the adapters; confirm
    # against the psycopg adaptation docs before reordering.
    register_vector(conn)
    with conn.cursor() as cur:
        for i in range(10):
            instance = trainset[i]
            # The dataset rows carry image objects rather than file paths, so a
            # placeholder string stands in for the filename column.
            filename = f"trainset[{i}].image"  # Mock the filename
            description = instance["image_description"]
            uncanny_description = instance["image_uncanny_description"]

            img = instance["image"]
            # Pre-process the image into PyTorch tensors ("pt") for the model.
            img_tensor = processor(img, return_tensors="pt")
            emb_tensor = model.get_image_features(**img_tensor)
            # Detach from the autograd graph, drop size-1 dimensions and convert
            # to a NumPy array; the result must fit the `vector(512)` column.
            embedding = emb_tensor.detach().squeeze().numpy()

            # Values are passed in the same order as the columns in insert_sql.
            cur.execute(insert_sql, (
                filename,
                description,
                uncanny_description,
                embedding
            ))

In [31]:
# Read one row back from `images` to check that the embedding survives the
# round trip through the database. `emb` is left as a notebook-level variable
# so the following cells can inspect its type and shape.
emb = None
with pg.connect(connstr) as conn:
    # Register pgvector's adapters so the `embedding` column is converted to a
    # Python object instead of being returned as raw text.
    register_vector(conn)
    with conn.cursor() as cur:
        cur.execute("SELECT id, filename, embedding FROM images LIMIT 1")
        row = cur.fetchone()
        # fetchone() returns None when the table is empty; fail with a clear
        # message instead of a TypeError on row[0].
        if row is None:
            raise RuntimeError("No rows found in the images table; run the insert cell first")
        print(row[0], row[1])
        # Take the embedding from the fetched row (third selected column), not
        # from the `embedding` variable left over by the insert loop.
        emb = row[2]

51 trainset[0].image


In [32]:
# Show which Python type `emb` has.
type(emb)

numpy.ndarray

In [33]:
# The `embedding` column is declared as vector(512), so a single
# 512-element vector is expected here.
emb.shape

(512,)