# Using BGE-VL-base model to embed

In [29]:
from transformers import AutoProcessor, AutoModel
import torch

# Hugging Face model id of the dense encoder used for both text and image features.
MODEL_NAME = "BAAI/BGE-VL-base" # or "BAAI/BGE-VL-large"

# Prefer the GPU when one is available; the model and its inputs are moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
# trust_remote_code=True allows transformers to run the model code shipped with the checkpoint.
dense_model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True).to(device) # You must set trust_remote_code=True
dense_model_processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Switch to evaluation mode. As the last expression of the cell this also makes
# the notebook display the module structure.
dense_model.eval()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [30]:
from PIL import Image

def get_text_embeddings(texts: list[str]) -> "torch.Tensor | list":
    """Embed a batch of strings with the dense model.

    Args:
        texts: Strings to embed. The processor pads them to a common length
            and truncates over-long inputs.

    Returns:
        An empty list when ``texts`` is empty; otherwise the CPU tensor
        produced by ``dense_model.get_text_features``, indexed per input text.
    """
    if not texts:
        return []
    inputs = dense_model_processor(text=texts, return_tensors="pt", truncation=True, padding=True).to(device)
    # Inference only: without no_grad() every call builds an autograd graph
    # and the returned tensor keeps it alive.
    with torch.no_grad():
        features = dense_model.get_text_features(**inputs)
    return features.cpu()

def get_image_embeddings(image_paths: list[str]) -> "torch.Tensor | list":
    """Embed a batch of image files with the dense model.

    Args:
        image_paths: Paths of the images to embed. Each image is converted
            to RGB before being handed to the processor.

    Returns:
        An empty list when ``image_paths`` is empty; otherwise the CPU tensor
        produced by ``dense_model.get_image_features``, indexed per input path.
    """
    if not image_paths:
        return []
    images = []
    for image_path in image_paths:
        # convert() returns a fully loaded copy, so the file handle can be
        # closed right away instead of staying open until garbage collection.
        with Image.open(image_path) as source:
            images.append(source.convert("RGB"))
    inputs = dense_model_processor(images=images, return_tensors="pt").to(device)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        features = dense_model.get_image_features(**inputs)
    return features.cpu()

In [31]:
# Two short sample sentences used to try out the embedding helpers.
texts = ["Hello, world!", "This is a test sentence."]

In [32]:
# Embed the sample sentences with the dense model.
text_embeddings = get_text_embeddings(texts)

In [33]:
# images = [
#     r"output\Objectifying_China\auto\images\0bf9d41010da900a0abb7048118e147c6e962eec73c6962affcf498cec014420.jpg",
#     r"output\Objectifying_China\auto\images\0bf9d41010da900a0abb7048118e147c6e962eec73c6962affcf498cec014420.jpg"
# ]
# image_embeddings = get_image_embeddings(images)

In [34]:
# Load model directly
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Hugging Face model id of the sparse encoder. Tokenizer and model are loaded
# from the same constant so the two can never point at different checkpoints.
SPARSE_MODEL_NAME = "naver/splade-v3"

device = "cuda" if torch.cuda.is_available() else "cpu"
sparse_model_tokenizer = AutoTokenizer.from_pretrained(SPARSE_MODEL_NAME)
# eval() is chained into the assignment so the model is explicitly in
# evaluation mode without the cell displaying the module structure.
sparse_model = AutoModelForMaskedLM.from_pretrained(SPARSE_MODEL_NAME).to(device).eval()

In [35]:
def get_sparse_embeddings(texts: list[str]):
    """Compute sparse embeddings for a batch of strings.

    For each text the masked-LM logits are passed through ``log(1 + relu(x))``,
    padding positions are zeroed with the attention mask, and the maximum over
    the sequence dimension is taken, giving one weight per vocabulary entry.

    Args:
        texts: Strings to embed.

    Returns:
        An empty list when ``texts`` is empty; otherwise one dict per input
        text with ``"indices"`` (positions of the non-zero weights) and
        ``"values"`` (the weights at those positions), both always lists.
    """
    if not texts:
        return []
    tokens = sparse_model_tokenizer(texts, return_tensors="pt", truncation=True, padding=True).to(device)
    # Inference only: skip autograd bookkeeping for the forward pass and pooling.
    with torch.no_grad():
        outputs = sparse_model(**tokens)
        weights = torch.log(1 + torch.relu(outputs.logits)) * tokens.attention_mask.unsqueeze(-1)
        sparse_embedding = torch.max(weights, dim=1)[0].cpu()

    # convert to pinecone sparse format
    res = []
    for row in sparse_embedding:
        # as_tuple=True always yields a 1-D index tensor. The previous
        # nonzero().squeeze() collapsed to a scalar when a row had exactly one
        # non-zero entry, producing a bare int/float instead of lists.
        indices = row.nonzero(as_tuple=True)[0]
        res.append({"indices": indices.tolist(), "values": row[indices].tolist()})
    return res

In [36]:
# Sparse embeddings for the sample sentences.
# NOTE(review): this rebinds `text_embeddings`, discarding the dense result
# assigned to the same name in an earlier cell.
text_embeddings = get_sparse_embeddings(texts)

In [37]:
import os
import json

In [120]:
# Maximum number of texts (or images) sent to the models in one call.
batch_size = 32


def generate_embedding_file(input_file, output_dir=None):
    """Embed every section of a chunked-document JSON file, in batches.

    The input is expected to map section ids to section dicts carrying a
    ``"text"`` string and an ``"img_path"`` list. Sections with empty text or
    no images are left without the corresponding keys.

    For each section this adds:
      * ``"dense_embeddings"`` / ``"sparse_embeddings"`` for its text;
      * ``"dense_image_embeddings"``, one vector per entry of ``"img_path"``,
        kept under its own key so it cannot overwrite the text vector.

    Args:
        input_file: Path of the JSON file to read.
        output_dir: Directory for the result; defaults to the directory of
            ``input_file``. Created if missing.

    The result is written to ``embeddings.json`` inside ``output_dir``.
    """
    if output_dir is None:
        output_dir = os.path.dirname(input_file)
    # dirname() is "" for a bare filename and makedirs("") raises, so fall
    # back to the current directory.
    output_dir = output_dir or "."
    os.makedirs(output_dir, exist_ok=True)

    with open(input_file, 'r', encoding='utf-8') as f:
        contents = json.load(f)

    sections = list(contents.values())

    # Text: embed only sections that actually have text, batch_size at a time.
    text_sections = [section for section in sections if section.get("text")]
    for start in range(0, len(text_sections), batch_size):
        batch = text_sections[start:start + batch_size]
        texts = [section["text"] for section in batch]
        dense_embeddings = get_text_embeddings(texts)
        sparse_embeddings = get_sparse_embeddings(texts)
        for section, dense, sparse in zip(batch, dense_embeddings, sparse_embeddings):
            section["dense_embeddings"] = dense.tolist()
            section["sparse_embeddings"] = sparse

    # Images: a section may own several images, so flatten to
    # (section, path) pairs and batch across section boundaries.
    image_jobs = []
    for section in sections:
        image_paths = section.get("img_path") or []
        if image_paths:
            section["dense_image_embeddings"] = []
            image_jobs.extend((section, image_path) for image_path in image_paths)
    for start in range(0, len(image_jobs), batch_size):
        batch = image_jobs[start:start + batch_size]
        dense_embeddings = get_image_embeddings([image_path for _, image_path in batch])
        for (section, _), dense in zip(batch, dense_embeddings):
            section["dense_image_embeddings"].append(dense.tolist())

    output_filename = "embeddings.json"
    output_file = os.path.join(output_dir, output_filename)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(contents, f, ensure_ascii=False, indent=2)

In [127]:
# Embed the chunked document and write the result into the embeddings directory.
# NOTE(review): this cell's counter (127) is later than the redefinition in
# cell 122, so presumably the later `generate_embedding_file` ran here - confirm.
generate_embedding_file("output/Objectifying_China/tagged/en_contents_doc_chunked.json", 
                        "output/Objectifying_China/embeddings")

In [122]:
def generate_embedding_file(input_file, output_dir=None):
    """Embed every section of a chunked-document JSON file, one section at a time.

    The input maps section ids to section dicts. For each section:
      * a non-empty ``"text"`` gets ``"dense_text_embedding"`` and
        ``"sparse_text_embedding"``;
      * a non-empty ``"img_path"`` list gets ``"dense_image_embeddings"``,
        one vector per path.
    Sections with neither are written back unchanged.

    Args:
        input_file: Path of the JSON file to read.
        output_dir: Directory for the result; defaults to the directory of
            ``input_file``. Created if missing.

    The result is written to ``<input basename>_embeddings.json`` inside
    ``output_dir``.
    """
    import os
    import json

    if output_dir is None:
        output_dir = os.path.dirname(input_file)
    # dirname() is "" for a bare filename and makedirs("") raises
    # FileNotFoundError, so fall back to the current directory.
    output_dir = output_dir or "."
    os.makedirs(output_dir, exist_ok=True)

    with open(input_file, 'r', encoding='utf-8') as f:
        contents = json.load(f)

    # Process each section (each section is a dictionary)
    for section in contents.values():
        raw_text = section.get("text", "")
        if raw_text:
            # The embedding helpers expect a list of strings.
            section["dense_text_embedding"] = get_text_embeddings([raw_text])[0].tolist()
            section["sparse_text_embedding"] = get_sparse_embeddings([raw_text])[0]

        # "img_path" is treated as a list of image file paths.
        image_paths = section.get("img_path", [])
        if image_paths:
            dense_image_embeddings = get_image_embeddings(image_paths)
            # Tensors are not JSON serializable; store plain lists.
            section["dense_image_embeddings"] = [emb.tolist() for emb in dense_image_embeddings]

    # Write the results to a new file named after the input.
    base = os.path.splitext(os.path.basename(input_file))[0]
    out_path = os.path.join(output_dir, f"{base}_embeddings.json")
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(contents, f, ensure_ascii=False, indent=2)

In [133]:
import json 

# Embeddings file to inspect; the name follows the "<input basename>_embeddings.json" pattern.
path = "output/Objectifying_China/embeddings/en_contents_doc_chunked_embeddings.json"

with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)
    
# Spot-check a single (section_id, section) pair. Index 50 is arbitrary and
# raises IndexError when the file holds fewer than 51 sections.
print(list(data.items())[50])

('fcxhje1w', {'id': 'fcxhje1w', 'type': 'section', 'header': 'Vase', 'text': '', 'page_idx': [], 'img_path': [], 'img_caption': [], 'img_footnote': [], 'time_period': [], 'materiality': [], 'region': [], 'colour': [], 'purpose': [], 'themes': [], 'exhibit': 'Objectifying China'})
