# CLIP as Module `Grok`

In [8]:
from langchain_core.embeddings import Embeddings
from transformers import CLIPProcessor, CLIPModel
import torch
from langchain_community.vectorstores import FAISS
import json

import os
import glob
import torch
import open_clip # open_clip_torch
from PIL import Image
import faiss
from langchain_community.vectorstores import FAISS
from openai import AzureOpenAI
import numpy as np
from base64 import b64encode

In [9]:

class CLIPImageEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a Hugging Face CLIP model.

    Text is embedded with ``CLIPModel.get_text_features`` and images with
    ``CLIPModel.get_image_features``. Vectors are returned exactly as the
    model produces them; no normalisation is applied here.
    """

    def __init__(self, model_name="openai/clip-vit-base-patch32"):
        """Load the model and its processor, then move the model to a device.

        Args:
            model_name: Identifier handed to ``from_pretrained`` for both the
                model and the processor.
        """
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        # Use the GPU when torch reports one, otherwise stay on the CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def _text_features(self, texts):
        """Return the text features for a non-empty list of strings as a numpy array."""
        inputs = self.processor(
            text=texts, return_tensors="pt", padding=True, truncation=True
        ).to(self.device)
        with torch.no_grad():
            return self.model.get_text_features(**inputs).cpu().numpy()

    def embed_documents(self, texts):
        """Embed a batch of strings.

        This is the method a vector store calls when it is built from
        documents, so it is on the indexing path.

        Args:
            texts: Iterable of strings to embed.

        Returns:
            A list with one embedding (list of floats) per input string;
            an empty list when ``texts`` is empty.
        """
        texts = list(texts)
        if not texts:
            # The processor cannot build a batch out of zero strings.
            return []
        return self._text_features(texts).tolist()

    def embed_query(self, text):
        """Embed a single query string.

        Args:
            text: The query string.

        Returns:
            The embedding as a list of floats.
        """
        return self._text_features([text])[0].tolist()

    def embed_image(self, image_path):
        """Embed the image stored at ``image_path``.

        Args:
            image_path: Path (or file object) accepted by ``PIL.Image.open``.

        Returns:
            The image embedding as a list of floats.
        """
        # Pixel data is read while the file is still open; the context manager
        # then releases the file handle instead of leaving it to the GC.
        with Image.open(image_path) as image:
            inputs = self.processor(images=image, return_tensors="pt").to(self.device)
        with torch.no_grad():
            embedding = self.model.get_image_features(**inputs).cpu().numpy()[0]
        return embedding.tolist()



In [10]:
# Index text chunks, image OCR text, and table files in FAISS using the CLIP embedder
def encode_and_store_content(text_data, image_data, table_data, output_folder="original_docs"):
    """Build a FAISS index over extracted text, images and tables and persist it.

    Every item becomes one ``Document``. Images are represented by their OCR
    text (``item["ocr_text"]``), not by pixel embeddings; tables by the raw
    contents of the file at ``item["path"]``.

    Side effects: saves the index to ``"faiss_index"`` and writes the
    per-document metadata list to ``"metadata.json"``, both relative to the
    current working directory.

    Args:
        text_data: Iterable of dicts with ``"content"`` and ``"page"`` keys.
        image_data: Iterable of dicts with ``"ocr_text"``, ``"page"`` and
            ``"path"`` keys.
        table_data: Iterable of dicts with ``"page"`` and ``"path"`` keys;
            ``"path"`` must point at a readable text file.
        output_folder: Currently unused; kept so existing callers keep working.

    Returns:
        A ``(clip_model, vector_store)`` tuple: the embedder instance created
        here and the FAISS store built from the documents.
    """
    # `Document` is not among the file's top-level imports, so bring it into
    # scope here to avoid a NameError on first use.
    from langchain_core.documents import Document

    clip_model = CLIPImageEmbeddings()
    documents = []
    metadata = []  # parallel to `documents`; dumped to metadata.json below

    # Text
    for item in text_data:
        documents.append(
            Document(
                page_content=item["content"],
                metadata={"type": "text", "page": item["page"]},
            )
        )
        metadata.append({"type": "text", "page": item["page"], "content": item["content"]})

    # Images (indexed through their OCR text)
    for item in image_data:
        documents.append(
            Document(
                page_content=item["ocr_text"],
                metadata={"type": "image", "page": item["page"], "path": item["path"]},
            )
        )
        metadata.append(
            {"type": "image", "page": item["page"], "path": item["path"], "ocr_text": item["ocr_text"]}
        )

    # Tables (file contents become the document body)
    for item in table_data:
        with open(item["path"], "r") as f:
            table_text = f.read()
        documents.append(
            Document(
                page_content=table_text,
                metadata={"type": "table", "page": item["page"], "path": item["path"]},
            )
        )
        metadata.append({"type": "table", "page": item["page"], "path": item["path"]})

    vector_store = FAISS.from_documents(documents, clip_model)
    vector_store.save_local("faiss_index")
    with open("metadata.json", "w") as f:
        json.dump(metadata, f)

    return clip_model, vector_store

In [11]:
# Instantiate the CLIP embedder with its default model name (loads model and processor).
model = CLIPImageEmbeddings()

In [12]:
# Quick check: embed one PNG from the images/ folder; the result is a list of floats.
embeds = model.embed_image('images/09a9f4be-8057-43d7-bdf2-a20404ab2165.png')